Remove disabled low bit-depth codepath

This MR removes the low bit-depth code path and the public API related to it.

AOM_IMG_FMT_HIGHBITDEPTH is left in place so that library users can keep feeding the encoder 8-bit buffers for 8-bit content. These frames are upshifted to 16-bit buffers in the aom_codec_encode_fn_t callback of aom_codec_av1_cx_algo and handled internally from there (the aomenc CLI already does this on its own, but other library users did not benefit from it).
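
To make the intended caller flow concrete, below is a minimal sketch (not part of this diff) of how a library user would keep feeding plain 8-bit AOM_IMG_FMT_I420 frames after this change: no AOM_CODEC_USE_HIGHBITDEPTH flag and no manual conversion, since the upshift now happens inside the encode callback. The dimensions, usage mode, and plane filling are placeholders; only existing public API calls (aom_img_alloc, aom_codec_enc_config_default, aom_codec_enc_init, aom_codec_encode) are used.

```c
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

/* Sketch: encode one 8-bit I420 frame; the encoder upshifts it to its
 * internal 16-bit buffers, so the caller never touches highbd formats. */
static int encode_8bit_frame(void) {
  aom_codec_ctx_t codec;
  aom_codec_enc_cfg_t cfg;
  aom_image_t img;

  if (aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
                                   AOM_USAGE_GOOD_QUALITY))
    return -1;
  cfg.g_w = 640;  /* placeholder dimensions */
  cfg.g_h = 480;

  if (aom_codec_enc_init(&codec, aom_codec_av1_cx(), &cfg, 0)) return -1;

  /* Plain 8-bit planar buffers; no AOM_IMG_FMT_HIGHBITDEPTH format needed. */
  if (!aom_img_alloc(&img, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 16)) {
    aom_codec_destroy(&codec);
    return -1;
  }
  /* ... fill img.planes[AOM_PLANE_Y/U/V] from the caller's 8-bit source ... */

  /* The 8-bit image is upshifted to 16-bit inside the encode callback. */
  const aom_codec_err_t res =
      aom_codec_encode(&codec, &img, /*pts=*/0, /*duration=*/1, /*flags=*/0);

  aom_img_free(&img);
  aom_codec_destroy(&codec);
  return res == AOM_CODEC_OK ? 0 : -1;
}
```

Compressed data would then be drained as usual with aom_codec_get_cx_data(); nothing in that part of the flow changes with this MR.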
diff --git a/aom/aom_codec.h b/aom/aom_codec.h
index 66b3cb6..993503b 100644
--- a/aom/aom_codec.h
+++ b/aom/aom_codec.h
@@ -83,7 +83,7 @@
  *       {
  *           aom_codec_ctx_t algo;
  *           int threads = 4;
- *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0 };
  *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
diff --git a/aom/aom_decoder.h b/aom/aom_decoder.h
index 60da50f..eea1213 100644
--- a/aom/aom_decoder.h
+++ b/aom/aom_decoder.h
@@ -93,8 +93,7 @@
   unsigned int threads; /**< Maximum number of threads to use, default 1 */
   unsigned int w;       /**< Width */
   unsigned int h;       /**< Height */
-  unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */
-} aom_codec_dec_cfg_t;            /**< alias for struct aom_codec_dec_cfg */
+} aom_codec_dec_cfg_t;  /**< alias for struct aom_codec_dec_cfg */
 
 /*!\brief Initialize a decoder instance
  *
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index 2261449..2ee03fd 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -56,10 +56,6 @@
  */
 #define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
 
-/*! Can support input images at greater than 8 bitdepth.
- */
-#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000
-
 /*! \brief Initialization-time Feature Enabling
  *
  *  Certain codec features must be known at initialization time, to allow
@@ -68,8 +64,6 @@
  *  The available flags are specified by AOM_CODEC_USE_* defines.
  */
 #define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
-/*!\brief Make the encoder output one  partition at a time. */
-#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
 /*!\brief Print per frame stats. */
 #define AOM_CODEC_USE_PER_FRAME_STATS 0x80000 /**< Enable printing of stats */
 
diff --git a/aom/exports_com b/aom/exports_com
index 266e294..caf11eb 100644
--- a/aom/exports_com
+++ b/aom/exports_com
@@ -26,6 +26,7 @@
 text aom_img_plane_width
 text aom_img_remove_metadata
 text aom_img_set_rect
+text aom_img_upshift
 text aom_img_wrap
 text aom_malloc
 text aom_rb_bytes_read
diff --git a/aom/internal/aom_codec_internal.h b/aom/internal/aom_codec_internal.h
index 0afc7b7..83e7d3c 100644
--- a/aom/internal/aom_codec_internal.h
+++ b/aom/internal/aom_codec_internal.h
@@ -36,7 +36,7 @@
  *       {
  *           aom_codec_ctx_t algo;
  *           int threads = 4;
- *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0 };
  *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
diff --git a/aom/internal/aom_image_internal.h b/aom/internal/aom_image_internal.h
index 5e23757..d347208 100644
--- a/aom/internal/aom_image_internal.h
+++ b/aom/internal/aom_image_internal.h
@@ -87,6 +87,8 @@
                                    aom_alloc_img_data_cb_fn_t alloc_cb,
                                    void *cb_priv);
 
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c
index 18bfd22..f52c571 100644
--- a/aom/src/aom_image.c
+++ b/aom/src/aom_image.c
@@ -10,9 +10,13 @@
  * aomedia.org/license/patent-license/.
  */
 
+#include <stdarg.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "config/aom_config.h"
+
 #include "aom/aom_image.h"
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_image_internal.h"
@@ -386,3 +390,114 @@
   if (!img || !img->metadata) return 0;
   return img->metadata->sz;
 }
+
+#define LOG_ERROR(label)               \
+  do {                                 \
+    const char *l = label;             \
+    va_list ap;                        \
+    va_start(ap, fmt);                 \
+    if (l) fprintf(stderr, "%s: ", l); \
+    vfprintf(stderr, fmt, ap);         \
+    fprintf(stderr, "\n");             \
+    va_end(ap);                        \
+  } while (0)
+
+#if defined(__GNUC__)
+#define AOM_NO_RETURN __attribute__((noreturn))
+#else
+#define AOM_NO_RETURN
+#endif
+
+AOM_NO_RETURN static void fatal(const char *fmt, ...) {
+  LOG_ERROR("Fatal");
+  exit(EXIT_FAILURE);
+}
+
+static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                               int input_shift) {
+#if CONFIG_ZERO_OFFSET_BITUPSHIFT
+  const int offset = 0;
+#else
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+#endif  // CONFIG_ZERO_OFFSET_BITUPSHIFT
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt ||
+      input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_I42216:
+    case AOM_IMG_FMT_I44416: break;
+    default: fatal("Unsupported image conversion"); break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      const uint16_t *p_src =
+          (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset;
+    }
+  }
+}
+
+static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                              int input_shift) {
+#if CONFIG_ZERO_OFFSET_BITUPSHIFT
+  const int offset = 0;
+#else
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+#endif  // CONFIG_ZERO_OFFSET_BITUPSHIFT
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  switch (src->fmt) {
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I444: break;
+    default: fatal("Unsupported image conversion"); break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      const uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+      }
+    }
+  }
+}
+
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                     int input_shift) {
+  if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_upshift(dst, src, input_shift);
+  } else {
+    lowbd_img_upshift(dst, src, input_shift);
+  }
+}
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 85455ec..ef68c63 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -34,45 +34,6 @@
   return sum;
 }
 
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const InterpKernel *x_filters, int x0_q4,
-                           int x_step_q4, int w, int h) {
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int sum = horz_scalar_product(src_x, x_filter);
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const InterpKernel *y_filters, int y0_q4,
-                          int y_step_q4, int w, int h) {
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
 static const InterpKernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
@@ -82,79 +43,6 @@
   return (int)((const InterpKernel *)(intptr_t)f - base);
 }
 
-void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                 w, h);
-}
-
-void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4, int w,
-                          int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
-                w, h);
-}
-
-void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const InterpKernel *filter,
-                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
-                     int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  // When calling in frame scaling function, the smallest scaling factor is x1/4
-  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
-  // big enough.
-  uint8_t temp[64 * 135];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= 64);
-  assert(h <= 64);
-  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
-  assert(x_step_q4 <= 64);
-
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
-                 filter, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
-                y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, int w, int h) {
-  for (int r = h; r > 0; --r) {
-    memmove(dst, src, w);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                              ptrdiff_t a_stride,
                                              const int16_t *b) {
@@ -170,12 +58,10 @@
   return sum;
 }
 
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
+                                  uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= SUBPEL_TAPS / 2 - 1;
   for (int y = 0; y < h; ++y) {
     int x_q4 = x0_q4;
@@ -191,12 +77,10 @@
   }
 }
 
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                 uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
+                                 uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   for (int x = 0; x < w; ++x) {
     int y_q4 = y0_q4;
@@ -213,11 +97,58 @@
   }
 }
 
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
+void aom_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
+                            uint16_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h, int bd) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  uint16_t temp[64 * 135];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                        temp, 64, filter, x0_q4, x_step_q4, w,
+                        intermediate_height, bd);
+  highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                       filter, y0_q4, y_step_q4, w, h, bd);
+}
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, int w, int h) {
+  for (int r = h; r > 0; --r) {
+    memmove(dst, src, w);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4, int w,
                                   int h, int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   const InterpKernel *const filters_x = get_filter_base(filter_x);
   const int x0_q4 = get_filter_offset(filter_x, filters_x);
   (void)filter_y;
@@ -227,11 +158,13 @@
                         x_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
+void aom_highbd_convolve8_vert_c(const uint8_t *src8, ptrdiff_t src_stride,
+                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   const InterpKernel *const filters_y = get_filter_base(filter_y);
   const int y0_q4 = get_filter_offset(filter_y, filters_y);
   (void)filter_x;
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4cc8f57..c8e32de 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -57,14 +57,12 @@
   "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
   "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
   "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm"
-  "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm"
   "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
 
 list(
   APPEND
   AOM_DSP_COMMON_INTRIN_SSE2
   "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
-  "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
   "${AOM_ROOT}/aom_dsp/x86/convolve.h"
   "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
@@ -73,9 +71,7 @@
   "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
-  "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
-  "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
   "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
   "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
@@ -88,12 +84,8 @@
      "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
      "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
 
-list(
-  APPEND
-  AOM_DSP_COMMON_INTRIN_SSSE3
-  "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
-  "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
-  "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
+list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+     "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
 
 list(
   APPEND
@@ -107,9 +99,7 @@
   APPEND
   AOM_DSP_COMMON_INTRIN_AVX2
   "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c"
-  "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
   "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
-  "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
   "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
   "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
   "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
@@ -154,7 +144,6 @@
   "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
   "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
   "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
-  "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
   "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
 
 if(CONFIG_AV1_DECODER)
@@ -206,14 +195,7 @@
     "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
     "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
     "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
-
-  list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64
-       "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm")
+    "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm")
 
   list(
     APPEND
@@ -232,7 +214,6 @@
     "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
 
   list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-       "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
        "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
 
   list(
@@ -243,7 +224,6 @@
     "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
@@ -251,7 +231,6 @@
     "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
 
@@ -268,7 +247,6 @@
     "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
     "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
     "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
-    "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
     "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
 
   list(
@@ -279,17 +257,9 @@
     "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
     "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
 
-  list(
-    APPEND
-    AOM_DSP_ENCODER_INTRIN_NEON
-    "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
+  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
+       "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
+       "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c")
 
   list(
     APPEND
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 92017cf..22e4a08 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -76,214 +76,11 @@
 foreach (@tx_sizes) {
   ($w, $h) = @$_;
   foreach $pred_name (@pred_names) {
-    add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
-              "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
     add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
               "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   }
 }
 
-specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x4 sse2/;
-specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
-specialize qw/aom_dc_top_predictor_16x8 sse2/;
-specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
-
-specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
-specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x4 sse2/;
-specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
-specialize qw/aom_dc_left_predictor_16x8 sse2/;
-specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
-specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x4 sse2/;
-specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
-specialize qw/aom_dc_128_predictor_16x8 sse2/;
-specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
-specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
-specialize qw/aom_v_predictor_4x4 neon msa sse2/;
-specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
-specialize qw/aom_v_predictor_8x4 sse2/;
-specialize qw/aom_v_predictor_8x8 neon msa sse2/;
-specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
-specialize qw/aom_v_predictor_16x8 sse2/;
-specialize qw/aom_v_predictor_16x16 neon msa sse2/;
-specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
-specialize qw/aom_v_predictor_32x16 sse2 avx2/;
-specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
-specialize qw/aom_v_predictor_32x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
-specialize qw/aom_h_predictor_4x8 sse2/;
-specialize qw/aom_h_predictor_4x16 sse2/;
-specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_8x4 sse2/;
-specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_8x16 sse2/;
-specialize qw/aom_h_predictor_8x32 sse2/;
-specialize qw/aom_h_predictor_16x4 sse2/;
-specialize qw/aom_h_predictor_16x8 sse2/;
-specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_16x32 sse2/;
-specialize qw/aom_h_predictor_16x64 sse2/;
-specialize qw/aom_h_predictor_32x8 sse2/;
-specialize qw/aom_h_predictor_32x16 sse2/;
-specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
-specialize qw/aom_h_predictor_32x64 sse2/;
-specialize qw/aom_h_predictor_64x64 sse2/;
-specialize qw/aom_h_predictor_64x32 sse2/;
-specialize qw/aom_h_predictor_64x16 sse2/;
-specialize qw/aom_paeth_predictor_4x4 ssse3/;
-specialize qw/aom_paeth_predictor_4x8 ssse3/;
-specialize qw/aom_paeth_predictor_4x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x4 ssse3/;
-specialize qw/aom_paeth_predictor_8x8 ssse3/;
-specialize qw/aom_paeth_predictor_8x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x32 ssse3/;
-specialize qw/aom_paeth_predictor_16x4 ssse3/;
-specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x8 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x8 ssse3/;
-specialize qw/aom_paeth_predictor_16x16 ssse3/;
-specialize qw/aom_paeth_predictor_16x32 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3/;
-specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
-specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
-specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
-specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
-specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
-specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
-specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
-specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
-specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
-specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
-specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
-specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
-specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
-specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
-specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
-specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
-specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
-specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
-specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
-
-specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
-
-# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
-# by multiply and shift.
-specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
-specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_dc_predictor_4x16 sse2/;
-specialize qw/aom_dc_predictor_8x4 sse2/;
-specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
-specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_dc_predictor_8x32 sse2/;
-specialize qw/aom_dc_predictor_16x4 sse2/;
-specialize qw/aom_dc_predictor_16x8 sse2/;
-specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
-specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_dc_predictor_16x64 sse2/;
-specialize qw/aom_dc_predictor_32x8 sse2/;
-specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
 specialize qw/aom_highbd_v_predictor_4x4 sse2/;
 specialize qw/aom_highbd_v_predictor_4x8 sse2/;
 specialize qw/aom_highbd_v_predictor_8x4 sse2/;
@@ -352,14 +149,10 @@
 #
 # Sub Pixel Filters
 #
-add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+add_proto qw/void aom_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
 add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
-add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
 specialize qw/aom_convolve_copy       neon dspr2 msa sse2 avx2/;
-specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
 
 add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
 specialize qw/aom_highbd_convolve_copy sse2 avx2/;
@@ -376,57 +169,7 @@
 if (aom_config("CONFIG_NEW_DF") eq "yes") {
       add_proto qw/void aom_highbd_lpf_horizontal_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
       add_proto qw/void aom_highbd_lpf_vertical_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
-      add_proto qw/void aom_lpf_horizontal_generic/, "uint8_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh";
-      add_proto qw/void aom_lpf_vertical_generic/, "uint8_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh";
 } else {
-add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_14 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_14_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_6 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_8 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_8_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_4_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_14 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_14_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_6 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_8 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_8_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_4_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_6_dual sse2/;
-
 add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
 specialize qw/aom_highbd_lpf_vertical_14 sse2/;
 
@@ -490,9 +233,6 @@
     add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride";
     specialize qw/aom_fdct4x4_lp neon sse2/;
 
-    add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64";
-    # High bit depth
     add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct8x8 sse2/;
 
@@ -530,15 +270,6 @@
 # Quantization
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-    add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-}  # CONFIG_AV1_ENCODER
-
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/aom_highbd_quantize_b sse2/;
 
@@ -558,15 +289,6 @@
 #
 # Alpha blending with mask
 #
-add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params";
-specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
-add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
-add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
-specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
-specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
-
 add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd";
 add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
 add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
@@ -579,16 +301,10 @@
 #
 # Block subtraction
 #
-add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/aom_subtract_block neon msa avx2/;
-
 add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
 specialize qw/aom_highbd_subtract_block sse2/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-  add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
-  specialize qw/aom_sse  sse4_1 avx2 neon/;
-
   add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
   specialize qw/aom_get_blk_sse_sum sse2 avx2/;
 
@@ -612,125 +328,8 @@
     specialize qw/aom_var_2d_u16 sse2 avx2/;
   }
 
-  #
-  # Single block SAD / Single block Avg SAD
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-    add_proto qw/unsigned int/, "aom_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-    add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-    add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
-  }
-
   add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
   specialize qw/aom_sum_sse_2d_i16 sse2 avx2/;
-  specialize qw/aom_sad128x128    avx2 neon     sse2/;
-  specialize qw/aom_sad128x64     avx2          sse2/;
-  specialize qw/aom_sad64x128     avx2          sse2/;
-  specialize qw/aom_sad64x64      avx2 neon msa sse2/;
-  specialize qw/aom_sad64x32      avx2      msa sse2/;
-  specialize qw/aom_sad32x64      avx2      msa sse2/;
-  specialize qw/aom_sad32x32      avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16      avx2      msa sse2/;
-  specialize qw/aom_sad16x32                msa sse2/;
-  specialize qw/aom_sad16x16           neon msa sse2/;
-  specialize qw/aom_sad16x8            neon msa sse2/;
-  specialize qw/aom_sad8x16            neon msa sse2/;
-  specialize qw/aom_sad8x8             neon msa sse2/;
-  specialize qw/aom_sad8x4                  msa sse2/;
-  specialize qw/aom_sad4x8                  msa sse2/;
-  specialize qw/aom_sad4x4             neon msa sse2/;
-
-  specialize qw/aom_sad4x16                     sse2/;
-  specialize qw/aom_sad16x4                     sse2/;
-  specialize qw/aom_sad8x32                     sse2/;
-  specialize qw/aom_sad32x8                     sse2/;
-  specialize qw/aom_sad16x64                    sse2/;
-  specialize qw/aom_sad64x16                    sse2/;
-
-  specialize qw/aom_sad_skip_128x128    avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_128x64     avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x128     avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x64      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_64x32      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x64      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x32      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_32x16      avx2          sse2  neon/;
-  specialize qw/aom_sad_skip_16x32                    sse2  neon/;
-  specialize qw/aom_sad_skip_16x16                    sse2  neon/;
-  specialize qw/aom_sad_skip_16x8                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x16                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x8                      sse2  neon/;
-  specialize qw/aom_sad_skip_4x8                      sse2  neon/;
-
-  specialize qw/aom_sad_skip_4x16                     sse2  neon/;
-  specialize qw/aom_sad_skip_8x32                     sse2  neon/;
-  specialize qw/aom_sad_skip_32x8                     sse2  neon/;
-  specialize qw/aom_sad_skip_16x64                    sse2  neon/;
-  specialize qw/aom_sad_skip_64x16                    sse2  neon/;
-
-  specialize qw/aom_sad128x128_avg avx2     sse2/;
-  specialize qw/aom_sad128x64_avg  avx2     sse2/;
-  specialize qw/aom_sad64x128_avg  avx2     sse2/;
-  specialize qw/aom_sad64x64_avg   avx2 msa sse2/;
-  specialize qw/aom_sad64x32_avg   avx2 msa sse2/;
-  specialize qw/aom_sad32x64_avg   avx2 msa sse2/;
-  specialize qw/aom_sad32x32_avg   avx2 msa sse2/;
-  specialize qw/aom_sad32x16_avg   avx2 msa sse2/;
-  specialize qw/aom_sad16x32_avg        msa sse2/;
-  specialize qw/aom_sad16x16_avg        msa sse2/;
-  specialize qw/aom_sad16x8_avg         msa sse2/;
-  specialize qw/aom_sad8x16_avg         msa sse2/;
-  specialize qw/aom_sad8x8_avg          msa sse2/;
-  specialize qw/aom_sad8x4_avg          msa sse2/;
-  specialize qw/aom_sad4x8_avg          msa sse2/;
-  specialize qw/aom_sad4x4_avg          msa sse2/;
-
-  specialize qw/aom_sad4x16_avg             sse2/;
-  specialize qw/aom_sad16x4_avg             sse2/;
-  specialize qw/aom_sad8x32_avg             sse2/;
-  specialize qw/aom_sad32x8_avg             sse2/;
-  specialize qw/aom_sad16x64_avg            sse2/;
-  specialize qw/aom_sad64x16_avg            sse2/;
-
-  specialize qw/aom_dist_wtd_sad128x128_avg ssse3/;
-  specialize qw/aom_dist_wtd_sad128x64_avg  ssse3/;
-  specialize qw/aom_dist_wtd_sad64x128_avg  ssse3/;
-  specialize qw/aom_dist_wtd_sad64x64_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad64x32_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad32x64_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad32x32_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad32x16_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad16x32_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad16x16_avg   ssse3/;
-  specialize qw/aom_dist_wtd_sad16x8_avg    ssse3/;
-  specialize qw/aom_dist_wtd_sad8x16_avg    ssse3/;
-  specialize qw/aom_dist_wtd_sad8x8_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad8x4_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad4x8_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad4x4_avg     ssse3/;
-
-  specialize qw/aom_dist_wtd_sad4x16_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad16x4_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad8x32_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad32x8_avg     ssse3/;
-  specialize qw/aom_dist_wtd_sad16x64_avg    ssse3/;
-  specialize qw/aom_dist_wtd_sad64x16_avg    ssse3/;
-
-  add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-
-  specialize qw/aom_sad4xh   sse2/;
-  specialize qw/aom_sad8xh   sse2/;
-  specialize qw/aom_sad16xh  sse2/;
-  specialize qw/aom_sad32xh  sse2/;
-  specialize qw/aom_sad64xh  sse2/;
-  specialize qw/aom_sad128xh sse2/;
 
   foreach (@block_sizes) {
     ($w, $h) = @$_;
@@ -813,12 +412,6 @@
   #
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
-    specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
-  }
-
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
     specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
   }
@@ -828,14 +421,6 @@
   #
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
-    }
-  }
-
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
     if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
       specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
@@ -847,128 +432,6 @@
   #
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
-    add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]";
-    add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
-    add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[]";
-  }
-
-  specialize qw/aom_sad128x128x4d avx2          sse2/;
-  specialize qw/aom_sad128x64x4d  avx2          sse2/;
-  specialize qw/aom_sad64x128x4d  avx2          sse2/;
-  specialize qw/aom_sad64x64x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad64x32x4d   avx2      msa sse2/;
-  specialize qw/aom_sad64x16x4d   avx2          sse2/;
-  specialize qw/aom_sad32x64x4d   avx2      msa sse2/;
-  specialize qw/aom_sad32x32x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16x4d   avx2      msa sse2/;
-  specialize qw/aom_sad32x8x4d    avx2          sse2/;
-  specialize qw/aom_sad16x64x4d                 sse2/;
-  specialize qw/aom_sad16x32x4d             msa sse2/;
-  specialize qw/aom_sad16x16x4d         neon msa sse2/;
-  specialize qw/aom_sad16x8x4d               msa sse2/;
-
-  specialize qw/aom_sad8x16x4d              msa sse2/;
-  specialize qw/aom_sad8x8x4d               msa sse2/;
-  specialize qw/aom_sad8x4x4d               msa sse2/;
-  specialize qw/aom_sad4x16x4d              msa sse2/;
-  specialize qw/aom_sad4x8x4d               msa sse2/;
-  specialize qw/aom_sad4x4x4d               msa sse2/;
-
-  specialize qw/aom_sad4x32x4d  sse2/;
-  specialize qw/aom_sad4x16x4d  sse2/;
-  specialize qw/aom_sad16x4x4d  sse2/;
-  specialize qw/aom_sad8x32x4d  sse2/;
-  specialize qw/aom_sad32x8x4d  sse2/;
-  specialize qw/aom_sad64x16x4d sse2/;
-
-  specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x128x4d  avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_64x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x64x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x32x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon/;
-  specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon/;
-
-  specialize qw/aom_sad_skip_16x64x4d        sse2 neon/;
-  specialize qw/aom_sad_skip_16x32x4d        sse2 neon/;
-  specialize qw/aom_sad_skip_16x16x4d        sse2 neon/;
-  specialize qw/aom_sad_skip_16x8x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
-  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
-  specialize qw/aom_sad_skip_4x32x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_32x8x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_64x16x4d        sse2 neon/;
-
-  specialize qw/aom_sad128x128x4d_avg sse2/;
-  specialize qw/aom_sad128x64x4d_avg  sse2/;
-  specialize qw/aom_sad64x128x4d_avg  sse2/;
-  specialize qw/aom_sad64x64x4d_avg   sse2/;
-  specialize qw/aom_sad64x32x4d_avg   sse2/;
-  specialize qw/aom_sad64x16x4d_avg   sse2/;
-  specialize qw/aom_sad32x64x4d_avg   sse2/;
-  specialize qw/aom_sad32x32x4d_avg   sse2/;
-  specialize qw/aom_sad32x16x4d_avg   sse2/;
-  specialize qw/aom_sad32x8x4d_avg    sse2/;
-  specialize qw/aom_sad16x64x4d_avg   sse2/;
-  specialize qw/aom_sad16x32x4d_avg   sse2/;
-  specialize qw/aom_sad16x16x4d_avg   sse2/;
-  specialize qw/aom_sad16x8x4d_avg    sse2/;
-
-  specialize qw/aom_sad8x16x4d_avg    sse2/;
-  specialize qw/aom_sad8x8x4d_avg     sse2/;
-  specialize qw/aom_sad8x4x4d_avg     sse2/;
-  specialize qw/aom_sad4x16x4d_avg    sse2/;
-  specialize qw/aom_sad4x8x4d_avg     sse2/;
-  specialize qw/aom_sad4x4x4d_avg     sse2/;
-
-  specialize qw/aom_sad4x32x4d_avg    sse2/;
-  specialize qw/aom_sad4x16x4d_avg    sse2/;
-  specialize qw/aom_sad16x4x4d_avg    sse2/;
-  specialize qw/aom_sad8x32x4d_avg    sse2/;
-  specialize qw/aom_sad32x8x4d_avg    sse2/;
-  specialize qw/aom_sad64x16x4d_avg   sse2/;
-
-  specialize qw/aom_masked_sad128x128x4d  ssse3/;
-  specialize qw/aom_masked_sad128x64x4d   ssse3/;
-  specialize qw/aom_masked_sad64x128x4d   ssse3/;
-  specialize qw/aom_masked_sad64x64x4d    ssse3/;
-  specialize qw/aom_masked_sad64x32x4d    ssse3/;
-  specialize qw/aom_masked_sad64x16x4d    ssse3/;
-  specialize qw/aom_masked_sad32x64x4d    ssse3/;
-  specialize qw/aom_masked_sad32x32x4d    ssse3/;
-  specialize qw/aom_masked_sad32x16x4d    ssse3/;
-  specialize qw/aom_masked_sad32x8x4d     ssse3/;
-  specialize qw/aom_masked_sad16x64x4d    ssse3/;
-  specialize qw/aom_masked_sad16x32x4d    ssse3/;
-  specialize qw/aom_masked_sad16x16x4d    ssse3/;
-  specialize qw/aom_masked_sad16x8x4d     ssse3/;
-
-  specialize qw/aom_masked_sad8x16x4d     ssse3/;
-  specialize qw/aom_masked_sad8x8x4d      ssse3/;
-  specialize qw/aom_masked_sad8x4x4d      ssse3/;
-  specialize qw/aom_masked_sad4x16x4d     ssse3/;
-  specialize qw/aom_masked_sad4x8x4d      ssse3/;
-  specialize qw/aom_masked_sad4x4x4d      ssse3/;
-
-  specialize qw/aom_masked_sad4x32x4d     ssse3/;
-  specialize qw/aom_masked_sad4x16x4d     ssse3/;
-  specialize qw/aom_masked_sad16x4x4d     ssse3/;
-  specialize qw/aom_masked_sad8x32x4d     ssse3/;
-  specialize qw/aom_masked_sad32x8x4d     ssse3/;
-  specialize qw/aom_masked_sad64x16x4d    ssse3/;
-  #
-  # Multi-block SAD, comparing a reference to N independent blocks
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
     add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
     add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
     if ($w != 128 && $h != 128) {
@@ -1023,15 +486,6 @@
   #
   # Avg
   #
-  add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/aom_avg_8x8 sse2 neon/;
-
-  add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/aom_avg_4x4 sse2 neon/;
-
-  add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/aom_minmax_8x8 sse2/;
-
   add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
   add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
   add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
@@ -1050,21 +504,6 @@
   #
   # hamadard transform and satd for implmenting temporal dependency model
   #
-  add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/aom_hadamard_8x8 sse2 neon/;
-
-  add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/aom_hadamard_16x16 avx2 sse2 neon/;
-
-  add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/aom_hadamard_32x32 avx2 sse2/;
-
-  add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
-
-  add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/aom_hadamard_lp_16x16 avx2 neon/;
-
   add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
   specialize qw/aom_highbd_hadamard_8x8 avx2/;
 
@@ -1084,12 +523,6 @@
   # Structured Similarity (SSIM)
   #
   if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
-
-    add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
-
     add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
   }
 }  # CONFIG_AV1_ENCODER
@@ -1098,23 +531,6 @@
   #
   # Specialty Variance
   #
-  add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  specialize qw/aom_get16x16var                neon msa/;
-  specialize qw/aom_get8x8var             sse2 neon msa/;
-
-
-  add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-
-  specialize qw/aom_mse16x16          sse2 avx2 neon msa/;
-  specialize qw/aom_mse16x8           sse2           msa/;
-  specialize qw/aom_mse8x16           sse2           msa/;
-  specialize qw/aom_mse8x8            sse2           msa/;
 
   foreach $bd (8, 10, 12) {
     add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
@@ -1133,30 +549,6 @@
   #
   #
   #
-  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
-  specialize qw/aom_upsampled_pred sse2/;
-
-  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                   int ref_stride, int subpel_search";
-  specialize qw/aom_comp_avg_upsampled_pred sse2/;
-
-  add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
-
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-                                                       int subpel_search";
-  specialize qw/aom_comp_mask_upsampled_pred sse2/;
-
   add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                    const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
                                                    int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
@@ -1173,131 +565,23 @@
                                                               int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
   specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
 
+  add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                              const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                              int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+                                                              int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+                                                              int bd, int subpel_search";
+
 
   #
   #
   #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
-  add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
   specialize qw/aom_get_mb_ss sse2 msa/;
-  specialize qw/aom_get4x4sse_cs neon msa/;
 
   #
   # Variance / Subpixel Variance / Subpixel Avg Variance
   #
-  add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
-  specialize qw/aom_mse_wxh_16bit  sse2 avx2/;
-
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
-  }
-  specialize qw/aom_variance128x128   sse2 avx2 neon    /;
-  specialize qw/aom_variance128x64    sse2 avx2 neon    /;
-  specialize qw/aom_variance64x128    sse2 avx2 neon    /;
-  specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
-  specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x16     sse2 avx2 neon msa/;
-  specialize qw/aom_variance16x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
-  specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
-  specialize qw/aom_variance8x16      sse2      neon msa/;
-  specialize qw/aom_variance8x8       sse2      neon msa/;
-  specialize qw/aom_variance8x4       sse2      neon msa/;
-  specialize qw/aom_variance4x8       sse2      neon msa/;
-  specialize qw/aom_variance4x4       sse2      neon msa/;
-
-  specialize qw/aom_sub_pixel_variance128x128   avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance128x64    avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x128    avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x16     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8      avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x16           neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x8            neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x4            neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x8            neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x4            neon msa sse2 ssse3/;
-
-  specialize qw/aom_sub_pixel_avg_variance128x128 avx2     sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance128x64  avx2     sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x128  avx2     sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x64   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x32   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x64   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x32   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x16   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x32        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x16        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x8         msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x16         msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x8          msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x4          msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x8          msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
-
-  specialize qw/aom_variance4x16 sse2/;
-  specialize qw/aom_variance16x4 sse2 avx2/;
-  specialize qw/aom_variance8x32 sse2/;
-  specialize qw/aom_variance32x8 sse2 avx2/;
-  specialize qw/aom_variance16x64 sse2 avx2/;
-  specialize qw/aom_variance64x16 sse2 avx2/;
-
-  specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
-
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4   ssse3/;
-
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
-
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128  ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64   ssse3/;
-  specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128   ssse3/;
-
   foreach $bd (8, 10, 12) {
     add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
@@ -1333,12 +617,6 @@
   #
   # Masked Variance / Masked Subpixel Variance
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
-    specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-  }
-
   foreach $bd ("_8_", "_10_", "_12_") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
@@ -1350,14 +628,6 @@
   #
   # OBMC Variance / OBMC Subpixel Variance
   #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
-    specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
-  }
-
   foreach $bd ("_", "_10_", "_12_") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
@@ -1367,53 +637,9 @@
     }
   }
 
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-
   #
   # Comp Avg
   #
-  add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-
-  add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
-  specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
-
   add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/aom_highbd_12_variance128x128 sse2/;
 
@@ -1830,9 +1056,6 @@
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
-  add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  specialize qw/aom_comp_mask_pred ssse3 avx2/;
-
   add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
   specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
 }  # CONFIG_AV1_ENCODER
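For orientation: in aom_dsp_rtcd_defs.pl, add_proto declares a function's C prototype and specialize lists the SIMD flavours that have optimized implementations; the RTCD generator turns each stanza into a dispatch entry, so deleting a stanza removes both the prototype and all of its per-ISA entry points. A minimal sketch of the dispatch idea, with hypothetical names (this is not the generated header):

    #include <stdint.h>

    typedef unsigned int (*sad_fn)(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride);

    /* Plain C fallback: sum of absolute differences over a 16x16 block. */
    static unsigned int my_sad16x16_c(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
      unsigned int sad = 0;
      for (int r = 0; r < 16; ++r)
        for (int c = 0; c < 16; ++c) {
          const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          sad += (d < 0) ? -d : d;
        }
      return sad;
    }

    /* One function pointer per add_proto; runtime setup swaps in the best
     * implementation listed by specialize for the detected CPU. */
    static sad_fn my_sad16x16 = my_sad16x16_c;

    static void my_setup_rtcd(int have_avx2, sad_fn sad16x16_avx2) {
      if (have_avx2) my_sad16x16 = sad16x16_avx2;
    }
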
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index eb48a42..82a9fc3 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -16,38 +16,6 @@
 #include "av1/common/arm/mem_neon.h"
 #include "av1/common/arm/transpose_neon.h"
 
-unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
-  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
-  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-#if defined(__aarch64__)
-  const uint32_t d = vaddlvq_u16(c);
-  return (d + 8) >> 4;
-#else
-  const uint32x2_t d = horizontal_add_u16x8(c);
-  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
-#endif
-}
-
-unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
-  uint16x8_t sum;
-  uint32x2_t d;
-  uint8x8_t b = vld1_u8(a);
-  a += a_stride;
-  uint8x8_t c = vld1_u8(a);
-  a += a_stride;
-  sum = vaddl_u8(b, c);
-
-  for (int i = 0; i < 6; ++i) {
-    const uint8x8_t e = vld1_u8(a);
-    a += a_stride;
-    sum = vaddw_u8(sum, e);
-  }
-
-  d = horizontal_add_u16x8(sum);
-
-  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
-}
-
 int aom_satd_lp_neon(const int16_t *coeff, int length) {
   const int16x4_t zero = vdup_n_s16(0);
   int32x4_t accum = vdupq_n_s32(0);
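The aom_avg_4x4/aom_avg_8x8 NEON kernels removed above compute the rounded mean of a block: (sum + 8) >> 4 for 4x4 and (sum + 32) >> 6 for 8x8, which is what the vrshr_n_u32 rounding shifts implement. A scalar sketch of the 8x8 case (illustrative only, not libaom API):

    #include <stdint.h>

    static unsigned int avg_8x8_scalar(const uint8_t *a, int a_stride) {
      unsigned int sum = 0;
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c) sum += a[r * a_stride + c];
      return (sum + 32) >> 6;  /* rounded mean of the 64 pixels */
    }
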
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
deleted file mode 100644
index 929792a..0000000
--- a/aom_dsp/arm/hadamard_neon.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
-
-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
-                                 int16x8_t *a6, int16x8_t *a7) {
-  const int16x8_t b0 = vaddq_s16(*a0, *a1);
-  const int16x8_t b1 = vsubq_s16(*a0, *a1);
-  const int16x8_t b2 = vaddq_s16(*a2, *a3);
-  const int16x8_t b3 = vsubq_s16(*a2, *a3);
-  const int16x8_t b4 = vaddq_s16(*a4, *a5);
-  const int16x8_t b5 = vsubq_s16(*a4, *a5);
-  const int16x8_t b6 = vaddq_s16(*a6, *a7);
-  const int16x8_t b7 = vsubq_s16(*a6, *a7);
-
-  const int16x8_t c0 = vaddq_s16(b0, b2);
-  const int16x8_t c1 = vaddq_s16(b1, b3);
-  const int16x8_t c2 = vsubq_s16(b0, b2);
-  const int16x8_t c3 = vsubq_s16(b1, b3);
-  const int16x8_t c4 = vaddq_s16(b4, b6);
-  const int16x8_t c5 = vaddq_s16(b5, b7);
-  const int16x8_t c6 = vsubq_s16(b4, b6);
-  const int16x8_t c7 = vsubq_s16(b5, b7);
-
-  *a0 = vaddq_s16(c0, c4);
-  *a1 = vsubq_s16(c2, c6);
-  *a2 = vsubq_s16(c0, c4);
-  *a3 = vaddq_s16(c2, c6);
-  *a4 = vaddq_s16(c3, c7);
-  *a5 = vsubq_s16(c3, c7);
-  *a6 = vsubq_s16(c1, c5);
-  *a7 = vaddq_s16(c1, c5);
-}
-
-void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
-                           tran_low_t *coeff) {
-  int16x8_t a0 = vld1q_s16(src_diff);
-  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
-  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
-  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
-  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
-  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
-  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
-  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  // Skip the second transpose because it is not required.
-
-  store_s16q_to_tran_low(coeff + 0, a0);
-  store_s16q_to_tran_low(coeff + 8, a1);
-  store_s16q_to_tran_low(coeff + 16, a2);
-  store_s16q_to_tran_low(coeff + 24, a3);
-  store_s16q_to_tran_low(coeff + 32, a4);
-  store_s16q_to_tran_low(coeff + 40, a5);
-  store_s16q_to_tran_low(coeff + 48, a6);
-  store_s16q_to_tran_low(coeff + 56, a7);
-}
-
-void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
-                              int16_t *coeff) {
-  int16x8_t a0 = vld1q_s16(src_diff);
-  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
-  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
-  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
-  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
-  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
-  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
-  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  // Skip the second transpose because it is not required.
-
-  vst1q_s16(coeff + 0, a0);
-  vst1q_s16(coeff + 8, a1);
-  vst1q_s16(coeff + 16, a2);
-  vst1q_s16(coeff + 24, a3);
-  vst1q_s16(coeff + 32, a4);
-  vst1q_s16(coeff + 40, a5);
-  vst1q_s16(coeff + 48, a6);
-  vst1q_s16(coeff + 56, a7);
-}
-
-void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
-                                int16_t *coeff) {
-  /* Rearrange 16x16 to 8x32 and remove stride.
-   * Top left first. */
-  aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
-                           coeff + 0);
-  /* Top right. */
-  aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
-                           coeff + 64);
-  /* Bottom left. */
-  aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
-                           coeff + 128);
-  /* Bottom right. */
-  aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
-                           coeff + 192);
-
-  for (int i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = vld1q_s16(coeff + 0);
-    const int16x8_t a1 = vld1q_s16(coeff + 64);
-    const int16x8_t a2 = vld1q_s16(coeff + 128);
-    const int16x8_t a3 = vld1q_s16(coeff + 192);
-
-    const int16x8_t b0 = vhaddq_s16(a0, a1);
-    const int16x8_t b1 = vhsubq_s16(a0, a1);
-    const int16x8_t b2 = vhaddq_s16(a2, a3);
-    const int16x8_t b3 = vhsubq_s16(a2, a3);
-
-    const int16x8_t c0 = vaddq_s16(b0, b2);
-    const int16x8_t c1 = vaddq_s16(b1, b3);
-    const int16x8_t c2 = vsubq_s16(b0, b2);
-    const int16x8_t c3 = vsubq_s16(b1, b3);
-
-    vst1q_s16(coeff + 0, c0);
-    vst1q_s16(coeff + 64, c1);
-    vst1q_s16(coeff + 128, c2);
-    vst1q_s16(coeff + 192, c3);
-
-    coeff += 8;
-  }
-}
-
-void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
-                             tran_low_t *coeff) {
-  /* Rearrange 16x16 to 8x32 and remove stride.
-   * Top left first. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
-  /* Top right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
-  /* Bottom left. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
-  /* Bottom right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
-
-  for (int i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
-    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
-    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
-    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
-
-    const int16x8_t b0 = vhaddq_s16(a0, a1);
-    const int16x8_t b1 = vhsubq_s16(a0, a1);
-    const int16x8_t b2 = vhaddq_s16(a2, a3);
-    const int16x8_t b3 = vhsubq_s16(a2, a3);
-
-    const int16x8_t c0 = vaddq_s16(b0, b2);
-    const int16x8_t c1 = vaddq_s16(b1, b3);
-    const int16x8_t c2 = vsubq_s16(b0, b2);
-    const int16x8_t c3 = vsubq_s16(b1, b3);
-
-    store_s16q_to_tran_low(coeff + 0, c0);
-    store_s16q_to_tran_low(coeff + 64, c1);
-    store_s16q_to_tran_low(coeff + 128, c2);
-    store_s16q_to_tran_low(coeff + 192, c3);
-
-    coeff += 8;
-  }
-}
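The deleted file above held the 8-bit NEON Hadamard kernels; the high-bitdepth variants (aom_highbd_hadamard_*) keep their specializations per the rtcd stanza earlier. The kernels are built from add/subtract butterflies, as in this generic in-place Walsh-Hadamard sketch (not bit-exact with libaom's coefficient ordering):

    #include <stdint.h>

    static void wht_1d(int16_t *v, int n) {  /* n must be a power of two */
      for (int len = 1; len < n; len <<= 1)
        for (int i = 0; i < n; i += 2 * len)
          for (int j = i; j < i + len; ++j) {
            const int16_t a = v[j], b = v[j + len];
            v[j] = (int16_t)(a + b);        /* butterfly: sum */
            v[j + len] = (int16_t)(a - b);  /* butterfly: difference */
          }
    }
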
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 0e80528..5f45ee8 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -19,519 +19,6 @@
 
 #include "aom/aom_integer.h"
 
-//------------------------------------------------------------------------------
-// DC 4x4
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *left, int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x8_t A = vld1_u8(above);  // top row
-    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    sum_top = vcombine_u16(p1, p1);
-  }
-
-  if (do_left) {
-    const uint8x8_t L = vld1_u8(left);   // left border
-    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    sum_left = vcombine_u16(p1, p1);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 3);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 2);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 2);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 4; ++i) {
-      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
-    }
-  }
-}
-
-void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  dc_4x4(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  dc_4x4(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  dc_4x4(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_4x4(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 8x8
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *left, int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x8_t A = vld1_u8(above);  // top row
-    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    sum_top = vcombine_u16(p2, p2);
-  }
-
-  if (do_left) {
-    const uint8x8_t L = vld1_u8(left);   // left border
-    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    sum_left = vcombine_u16(p2, p2);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 4);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 3);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 3);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 8; ++i) {
-      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
-    }
-  }
-}
-
-void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  dc_8x8(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  dc_8x8(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  dc_8x8(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_8x8(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 16x16
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left,
-                            int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x16_t A = vld1q_u8(above);  // top row
-    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
-    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    const uint16x4_t p3 = vpadd_u16(p2, p2);
-    sum_top = vcombine_u16(p3, p3);
-  }
-
-  if (do_left) {
-    const uint8x16_t L = vld1q_u8(left);  // left row
-    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
-    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    const uint16x4_t p3 = vpadd_u16(p2, p2);
-    sum_left = vcombine_u16(p3, p3);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 5);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 4);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 4);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 16; ++i) {
-      vst1q_u8(dst + i * stride, dc);
-    }
-  }
-}
-
-void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  dc_16x16(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  dc_16x16(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  dc_16x16(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_16x16(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 32x32
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left,
-                            int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x16_t A0 = vld1q_u8(above);  // top row
-    const uint8x16_t A1 = vld1q_u8(above + 16);
-    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
-    const uint16x8_t p1 = vpaddlq_u8(A1);
-    const uint16x8_t p2 = vaddq_u16(p0, p1);
-    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-    const uint16x4_t p4 = vpadd_u16(p3, p3);
-    const uint16x4_t p5 = vpadd_u16(p4, p4);
-    sum_top = vcombine_u16(p5, p5);
-  }
-
-  if (do_left) {
-    const uint8x16_t L0 = vld1q_u8(left);  // left row
-    const uint8x16_t L1 = vld1q_u8(left + 16);
-    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
-    const uint16x8_t p1 = vpaddlq_u8(L1);
-    const uint16x8_t p2 = vaddq_u16(p0, p1);
-    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-    const uint16x4_t p4 = vpadd_u16(p3, p3);
-    const uint16x4_t p5 = vpadd_u16(p4, p4);
-    sum_left = vcombine_u16(p5, p5);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 6);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 5);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 5);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 32; ++i) {
-      vst1q_u8(dst + i * stride, dc);
-      vst1q_u8(dst + i * stride + 16, dc);
-    }
-  }
-}
-
-void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  dc_32x32(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  dc_32x32(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  dc_32x32(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_32x32(dst, stride, NULL, NULL, 0, 0);
-}
-
-// -----------------------------------------------------------------------------
-
-void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
-  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
-  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
-  const uint32x2_t zero = vdup_n_u32(0);
-  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
-  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
-  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
-  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
-  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
-  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
-  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
-  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
-  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
-  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
-  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
-  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
-  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
-  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
-  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
-  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
-  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
-  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
-  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
-  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
-}
-
-void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint32x2_t d0u32 = vdup_n_u32(0);
-  (void)left;
-
-  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
-  for (i = 0; i < 4; i++, dst += stride)
-    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-}
-
-void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  (void)left;
-
-  d0u8 = vld1_u8(above);
-  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
-}
-
-void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  (void)left;
-
-  q0u8 = vld1q_u8(above);
-  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
-}
-
-void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
-  (void)left;
-
-  q0u8 = vld1q_u8(above);
-  q1u8 = vld1q_u8(above + 16);
-  for (i = 0; i < 32; i++, dst += stride) {
-    vst1q_u8(dst, q0u8);
-    vst1q_u8(dst + 16, q1u8);
-  }
-}
-
-void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint32x2_t d1u32 = vdup_n_u32(0);
-  (void)above;
-
-  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-}
-
-void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint64x1_t d1u64 = vdup_n_u64(0);
-  (void)above;
-
-  d1u64 = vld1_u64((const uint64_t *)left);
-
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
-  vst1_u8(dst, d0u8);
-}
-
-void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int j;
-  uint8x8_t d2u8 = vdup_n_u8(0);
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
-  (void)above;
-
-  q1u8 = vld1q_u8(left);
-  d2u8 = vget_low_u8(q1u8);
-  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-    q0u8 = vdupq_lane_u8(d2u8, 0);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 1);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 2);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 3);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 4);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 5);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 6);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 7);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-  }
-}
-
-void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint8x8_t d2u8 = vdup_n_u8(0);
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
-  (void)above;
-
-  for (k = 0; k < 2; k++, left += 16) {
-    q1u8 = vld1q_u8(left);
-    d2u8 = vget_low_u8(q1u8);
-    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-      q0u8 = vdupq_lane_u8(d2u8, 0);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 1);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 2);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 3);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 4);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 5);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 6);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 7);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-    }
-  }
-}
-
 static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                        const uint16_t *above,
                                        const uint16_t *left) {
@@ -592,524 +79,4 @@
 intra_pred_square(dc);
 #undef intra_pred_square
 
-static const int sm_weight_log2_scale = 8;
-
-// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
-#define MAX_BLOCK_DIM 64
-
-/* clang-format off */
-static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
-    // Unused, because we always offset by bs, which is at least 2.
-    0, 0,
-    // bs = 2
-    255, 128,
-    // bs = 4
-    255, 149, 85, 64,
-    // bs = 8
-    255, 197, 146, 105, 73, 50, 37, 32,
-    // bs = 16
-    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-    // bs = 32
-    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-    // bs = 64
-    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
-    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
-    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-};
 /* clang-format on */
-
-// -----------------------------------------------------------------------------
-// SMOOTH_PRED
-
-// pixels[0]: above and below_pred interleave vector
-// pixels[1]: left vector
-// pixels[2]: right_pred vector
-static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
-                                 int height, uint8x16_t *pixels) {
-  uint32x4_t zero = vdupq_n_u32(0);
-  const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
-  if (height == 4)
-    pixels[1] =
-        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
-  else if (height == 8) {
-    pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
-        ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
-  } else {
-    pixels[1] = vld1q_u8(left);
-  }
-
-  pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
-
-  const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
-#if defined(__aarch64__)
-  pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
-#else
-  pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
-#endif  // (__aarch64__)
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// weight_w[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
-  const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
-  const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
-  weight_h[0] = vmovl_u8(t);
-  weight_h[1] = vsubw_u8(d, t);
-#if defined(__aarch64__)
-  weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
-#else
-  weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
-#endif  // (__aarch64__)
-
-  if (height == 8) {
-    const uint8x8_t weight = vld1_u8(&weight_array[8]);
-    weight_h[0] = vmovl_u8(weight);
-    weight_h[1] = vsubw_u8(d, weight);
-  } else if (height == 16) {
-    const uint8x16_t zero = vdupq_n_u8(0);
-    const uint8x16_t weight = vld1q_u8(&weight_array[16]);
-    const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
-    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
-    weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
-    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
-    weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
-  }
-}
-
-static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
-                                   const uint16x8_t *wh, const uint16x8_t *ww,
-                                   int h, uint8_t *dst, ptrdiff_t stride,
-                                   int second_half) {
-  const uint16x4_t one = vdup_n_u16(1);
-  const uint16x4_t inc = vdup_n_u16(0x202);
-  uint16x4_t rep =
-      second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
-  uint16x4_t d = vdup_n_u16(0x100);
-  const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
-  const uint16x4_t v_pixel_0_hi =
-      vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
-  const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
-  const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
-  const uint16x4_t ww_0_hi =
-      vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
-  const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
-
-#if !defined(__aarch64__)
-  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
-                                   vget_high_u8(
-                                       vreinterpretq_u8_u16(wh[0])) } };
-  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
-                                   vget_high_u8(
-                                       vreinterpretq_u8_u16(wh[1])) } };
-  const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
-                                   vget_high_u8(pixel[1]) } };
-#endif  // (__aarch64__)
-
-  for (int i = 0; i < h; ++i) {
-#if defined(__aarch64__)
-    const uint8x8_t wg =
-        vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
-    const uint8x8_t sc =
-        vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
-#else
-    const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
-    const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
-#endif  // (__aarch64__)
-
-    uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
-    sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
-
-#if defined(__aarch64__)
-    uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
-#else
-    uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
-#endif  // (__aarch64__)
-
-    sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
-    sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
-    uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
-    uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
-    vst1_lane_u32((uint32_t *)dst, predsh, 0);
-
-    dst += stride;
-
-    rep = vadd_u16(rep, one);
-    d = vadd_u16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[3];
-  load_pixel_w4(above, left, 4, pixels);
-
-  uint16x8_t wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[3];
-  load_pixel_w4(above, left, 8, pixels);
-
-  uint16x8_t wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[3];
-  load_pixel_w4(above, left, 16, pixels);
-
-  uint16x8_t wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-// pixels[2]: left vector
-// pixels[3]: right_pred vector
-// pixels[4]: above and below_pred interleave vector, first half
-// pixels[5]: above and below_pred interleave vector, second half
-// pixels[6]: left vector + 16
-// pixels[7]: right_pred vector
-static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
-                                 int height, uint8x16_t *pixels) {
-  pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
-  pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
-  pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
-
-  if (height == 4) {
-    const uint32x4_t zero32 = vdupq_n_u32(0);
-    pixels[2] =
-        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
-  } else if (height == 8) {
-    const uint64x2_t zero64 = vdupq_n_u64(0);
-    pixels[2] = vreinterpretq_u8_u64(
-        vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
-  } else if (height == 16) {
-    pixels[2] = vld1q_u8(left);
-  } else {
-    pixels[2] = vld1q_u8(left);
-    pixels[4] = pixels[0];
-    pixels[5] = pixels[1];
-    pixels[6] = vld1q_u8(left + 16);
-    pixels[7] = pixels[3];
-  }
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
-                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
-  const uint8x16_t zero = vdupq_n_u8(0);
-  const int we_offset = height < 8 ? 4 : 8;
-  uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
-#if defined(__aarch64__)
-  weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
-#else
-  weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
-#endif  // (__aarch64__)
-  const uint16x8_t d = vdupq_n_u16(256);
-  weight_h[1] = vsubq_u16(d, weight_h[0]);
-
-  if (height == 4) {
-    we = vextq_u8(we, zero, 4);
-#if defined(__aarch64__)
-    weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
-#else
-    weight_w[0] = vmovl_u8(vget_low_u8(we));
-#endif  // (__aarch64__)
-    weight_w[1] = vsubq_u16(d, weight_w[0]);
-  } else {
-    weight_w[0] = weight_h[0];
-    weight_w[1] = weight_h[1];
-  }
-
-  if (height == 16) {
-    we = vld1q_u8(&weight_array[16]);
-    const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
-    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
-    weight_h[1] = vsubq_u16(d, weight_h[0]);
-    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
-    weight_h[3] = vsubq_u16(d, weight_h[2]);
-  } else if (height == 32) {
-    const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
-    const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
-    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
-    weight_h[1] = vsubq_u16(d, weight_h[0]);
-    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
-    weight_h[3] = vsubq_u16(d, weight_h[2]);
-    const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
-    const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
-    weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
-    weight_h[5] = vsubq_u16(d, weight_h[4]);
-    weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
-    weight_h[7] = vsubq_u16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
-                                   const uint16x8_t *wh, const uint16x8_t *ww,
-                                   int h, uint8_t *dst, ptrdiff_t stride,
-                                   int second_half) {
-  const uint16x8_t one = vdupq_n_u16(1);
-  const uint16x8_t inc = vdupq_n_u16(0x202);
-  uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
-                               : vdupq_n_u16((uint16_t)0x8000);
-  uint16x8_t d = vdupq_n_u16(0x100);
-
-#if !defined(__aarch64__)
-  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
-                                   vget_high_u8(
-                                       vreinterpretq_u8_u16(wh[0])) } };
-  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
-                                   vget_high_u8(
-                                       vreinterpretq_u8_u16(wh[1])) } };
-  const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
-                                   vget_high_u8(pixels[2]) } };
-#endif
-
-  for (int i = 0; i < h; ++i) {
-#if defined(__aarch64__)
-    const uint8x16_t wg_wg =
-        vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
-    const uint8x16_t sc_sc =
-        vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
-#else
-    const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
-    const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
-    const uint8x16_t wg_wg =
-        vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
-    const uint8x16_t sc_sc =
-        vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
-#endif  // (__aarch64__)
-    uint16x8_t s01 =
-        vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
-    s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
-                    vreinterpretq_u16_u8(sc_sc));
-#if defined(__aarch64__)
-    const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
-#else
-    const uint8x16_t b = vcombine_u8(
-        vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
-        vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
-#endif  // (__aarch64__)
-    uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
-    sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
-
-    uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
-#if defined(__aarch64__)
-    uint32x4_t s1 = vaddl_high_u16(s01, sum0);
-#else
-    uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
-#endif  // (__aarch64__)
-
-    sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
-    uint8x8_t predsh = vqmovn_u16(sum0);
-    vst1_u8(dst, predsh);
-
-    dst += stride;
-    rep = vaddq_u16(rep, one);
-    d = vaddq_u16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[4];
-  load_pixel_w8(above, left, 4, pixels);
-
-  uint16x8_t wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[4];
-  load_pixel_w8(above, left, 8, pixels);
-
-  uint16x8_t wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[4];
-  load_pixel_w8(above, left, 16, pixels);
-
-  uint16x8_t wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  uint8x16_t pixels[8];
-  load_pixel_w8(above, left, 32, pixels);
-
-  uint16x8_t wh[8], ww[2];
-  load_weight_w8(sm_weight_arrays, 32, wh, ww);
-
-  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left, uint32_t bw,
-                                        uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const uint16x8_t scale_value = vdupq_n_u16(256);
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const uint8x8_t left_y = vdup_n_u8(left[y]);
-    const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
-    const uint32x4_t pred_scaled_bl =
-        vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
-      const uint8x8_t top_x = vld1_u8(above + x);
-
-      uint16x8_t pred_m1, pred_m2;
-      uint32x4_t pred_lo, pred_hi;
-      pred_m1 = vmull_u8(top_x, weights_y_dup);
-      pred_m2 = vmull_u8(weights_x, left_y);
-
-      pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
-#if defined(__aarch64__)
-      pred_hi = vaddl_high_u16(pred_m1, pred_m2);
-#else
-      pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
-#endif  // (__aarch64__)
-
-      const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
-
-      const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
-
-      pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
-      pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
-
-      pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
-#if defined(__aarch64__)
-      pred_hi = vaddw_high_u16(pred_hi, swxtr);
-#else
-      pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
-#endif  // (__aarch64__)
-
-      uint16x8_t pred =
-          vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
-
-      uint8x8_t predsh = vqmovn_u16(pred);
-
-      vst1_u8(dst + x, predsh);
-    }
-
-    dst += stride;
-  }
-}
-
-void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
-}
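For context: the smooth-predictor kernels deleted above all vectorize the same scalar blend that smooth_predictor_wxh spells out. Each output pixel is a weighted sum of the pixel above it, the pixel to its left, the top-right pixel and the bottom-left pixel, with weights taken from sm_weight_arrays and a fixed-point scale of 256 (1 << sm_weight_log2_scale), rounded back to 8 bits by a shift of 9. A rough scalar sketch of that computation, for reference only and not part of the patch:

static void smooth_predictor_scalar(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left,
                                    int bw, int bh) {
  const uint8_t *const w_w = sm_weight_arrays + bw;  // horizontal weights
  const uint8_t *const w_h = sm_weight_arrays + bh;  // vertical weights
  const int scale = 256;                             // 1 << sm_weight_log2_scale
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int sum = w_h[r] * above[c] + (scale - w_h[r]) * left[bh - 1] +
                      w_w[c] * left[r] + (scale - w_w[c]) * above[bw - 1];
      // The total weight is 2 * scale, so round with +scale and shift by 9.
      dst[c] = (uint8_t)((sum + scale) >> 9);
    }
    dst += stride;
  }
}

The 4- and 8-wide variants above are this loop with the weights and edge pixels preloaded into vector registers and the rows unrolled in halves.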
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
deleted file mode 100644
index 8e60939..0000000
--- a/aom_dsp/arm/sad4d_neon.c
+++ /dev/null
@@ -1,592 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
-// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
-// and vec_sum_ref_hi.
-static void sad_neon_64(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16,
-                        const uint8x16_t vec_src_32,
-                        const uint8x16_t vec_src_48, const uint8_t *ref,
-                        uint16x8_t *vec_sum_ref_lo,
-                        uint16x8_t *vec_sum_ref_hi) {
-  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
-                             vget_low_u8(vec_ref_00));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
-                             vget_high_u8(vec_ref_00));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
-                             vget_low_u8(vec_ref_16));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
-                             vget_high_u8(vec_ref_16));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
-                             vget_low_u8(vec_ref_32));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
-                             vget_high_u8(vec_ref_32));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
-                             vget_low_u8(vec_ref_48));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
-                             vget_high_u8(vec_ref_48));
-}
-
-// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
-// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
-static void sad_neon_32(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16, const uint8_t *ref,
-                        uint16x8_t *vec_sum_ref_lo,
-                        uint16x8_t *vec_sum_ref_hi) {
-  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
-                             vget_low_u8(vec_ref_00));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
-                             vget_high_u8(vec_ref_00));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
-                             vget_low_u8(vec_ref_16));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
-                             vget_high_u8(vec_ref_16));
-}
-
-void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  for (i = 0; i < 64; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
-                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
-                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
-                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
-                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  for (i = 0; i < 32; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-
-    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
-                &vec_sum_ref0_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
-                &vec_sum_ref1_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
-                &vec_sum_ref2_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
-                &vec_sum_ref3_hi);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  for (i = 0; i < 16; ++i) {
-    const uint8x16_t vec_src = vld1q_u8(src);
-    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
-    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
-    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
-    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
-
-    vec_sum_ref0_lo =
-        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
-    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref0));
-    vec_sum_ref1_lo =
-        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
-    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref1));
-    vec_sum_ref2_lo =
-        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
-    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref2));
-    vec_sum_ref3_lo =
-        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
-    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref3));
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-static INLINE unsigned int horizontal_add_16x4(const uint16x4_t vec_16x4) {
-  const uint32x2_t a = vpaddl_u16(vec_16x4);
-  const uint64x1_t b = vpaddl_u32(a);
-  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-}
-
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
-  const uint32x4_t a = vpaddlq_u16(vec_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-static void sad_row4_neon(uint16x4_t *vec_src, const uint8x8_t q0,
-                          const uint8x8_t ref) {
-  uint8x8_t q2 = vabd_u8(q0, ref);
-  *vec_src = vpadal_u8(*vec_src, q2);
-}
-
-static void sad_row8_neon(uint16x4_t *vec_src, const uint8x8_t *q0,
-                          const uint8_t *ref_ptr) {
-  uint8x8_t q1 = vld1_u8(ref_ptr);
-  uint8x8_t q2 = vabd_u8(*q0, q1);
-  *vec_src = vpadal_u8(*vec_src, q2);
-}
-
-static void sad_row16_neon(uint16x8_t *vec_src, const uint8x16_t *q0,
-                           const uint8_t *ref_ptr) {
-  uint8x16_t q1 = vld1q_u8(ref_ptr);
-  uint8x16_t q2 = vabdq_u8(*q0, q1);
-  *vec_src = vpadalq_u8(*vec_src, q2);
-}
-
-void aom_sadMxNx4d_neon(int width, int height, const uint8_t *src,
-                        int src_stride, const uint8_t *const ref[4],
-                        int ref_stride, uint32_t res[4]) {
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  res[0] = 0;
-  res[1] = 0;
-  res[2] = 0;
-  res[3] = 0;
-
-  switch (width) {
-    case 4: {
-      uint32_t src4, ref40, ref41, ref42, ref43;
-      uint32x2_t q8 = vdup_n_u32(0);
-      uint32x2_t q4 = vdup_n_u32(0);
-      uint32x2_t q5 = vdup_n_u32(0);
-      uint32x2_t q6 = vdup_n_u32(0);
-      uint32x2_t q7 = vdup_n_u32(0);
-
-      for (int i = 0; i < height / 2; i++) {
-        uint16x4_t q0 = vdup_n_u16(0);
-        uint16x4_t q1 = vdup_n_u16(0);
-        uint16x4_t q2 = vdup_n_u16(0);
-        uint16x4_t q3 = vdup_n_u16(0);
-
-        memcpy(&src4, src, 4);
-        memcpy(&ref40, ref0, 4);
-        memcpy(&ref41, ref1, 4);
-        memcpy(&ref42, ref2, 4);
-        memcpy(&ref43, ref3, 4);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        q8 = vset_lane_u32(src4, q8, 0);
-        q4 = vset_lane_u32(ref40, q4, 0);
-        q5 = vset_lane_u32(ref41, q5, 0);
-        q6 = vset_lane_u32(ref42, q6, 0);
-        q7 = vset_lane_u32(ref43, q7, 0);
-
-        memcpy(&src4, src, 4);
-        memcpy(&ref40, ref0, 4);
-        memcpy(&ref41, ref1, 4);
-        memcpy(&ref42, ref2, 4);
-        memcpy(&ref43, ref3, 4);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        q8 = vset_lane_u32(src4, q8, 1);
-        q4 = vset_lane_u32(ref40, q4, 1);
-        q5 = vset_lane_u32(ref41, q5, 1);
-        q6 = vset_lane_u32(ref42, q6, 1);
-        q7 = vset_lane_u32(ref43, q7, 1);
-
-        sad_row4_neon(&q0, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q4));
-        sad_row4_neon(&q1, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q5));
-        sad_row4_neon(&q2, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q6));
-        sad_row4_neon(&q3, vreinterpret_u8_u32(q8), vreinterpret_u8_u32(q7));
-
-        res[0] += horizontal_add_16x4(q0);
-        res[1] += horizontal_add_16x4(q1);
-        res[2] += horizontal_add_16x4(q2);
-        res[3] += horizontal_add_16x4(q3);
-      }
-      break;
-    }
-    case 8: {
-      for (int i = 0; i < height; i++) {
-        uint16x4_t q0 = vdup_n_u16(0);
-        uint16x4_t q1 = vdup_n_u16(0);
-        uint16x4_t q2 = vdup_n_u16(0);
-        uint16x4_t q3 = vdup_n_u16(0);
-
-        uint8x8_t q5 = vld1_u8(src);
-
-        sad_row8_neon(&q0, &q5, ref0);
-        sad_row8_neon(&q1, &q5, ref1);
-        sad_row8_neon(&q2, &q5, ref2);
-        sad_row8_neon(&q3, &q5, ref3);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        res[0] += horizontal_add_16x4(q0);
-        res[1] += horizontal_add_16x4(q1);
-        res[2] += horizontal_add_16x4(q2);
-        res[3] += horizontal_add_16x4(q3);
-      }
-      break;
-    }
-    case 16: {
-      for (int i = 0; i < height; i++) {
-        uint16x8_t q0 = vdupq_n_u16(0);
-        uint16x8_t q1 = vdupq_n_u16(0);
-        uint16x8_t q2 = vdupq_n_u16(0);
-        uint16x8_t q3 = vdupq_n_u16(0);
-
-        uint8x16_t q4 = vld1q_u8(src);
-
-        sad_row16_neon(&q0, &q4, ref0);
-        sad_row16_neon(&q1, &q4, ref1);
-        sad_row16_neon(&q2, &q4, ref2);
-        sad_row16_neon(&q3, &q4, ref3);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
-      }
-      break;
-    }
-    case 32: {
-      for (int i = 0; i < height; i++) {
-        uint16x8_t q0 = vdupq_n_u16(0);
-        uint16x8_t q1 = vdupq_n_u16(0);
-        uint16x8_t q2 = vdupq_n_u16(0);
-        uint16x8_t q3 = vdupq_n_u16(0);
-
-        uint8x16_t q4 = vld1q_u8(src);
-
-        sad_row16_neon(&q0, &q4, ref0);
-        sad_row16_neon(&q1, &q4, ref1);
-        sad_row16_neon(&q2, &q4, ref2);
-        sad_row16_neon(&q3, &q4, ref3);
-
-        q4 = vld1q_u8(src + 16);
-
-        sad_row16_neon(&q0, &q4, ref0 + 16);
-        sad_row16_neon(&q1, &q4, ref1 + 16);
-        sad_row16_neon(&q2, &q4, ref2 + 16);
-        sad_row16_neon(&q3, &q4, ref3 + 16);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
-      }
-      break;
-    }
-    case 64: {
-      for (int i = 0; i < height; i++) {
-        uint16x8_t q0 = vdupq_n_u16(0);
-        uint16x8_t q1 = vdupq_n_u16(0);
-        uint16x8_t q2 = vdupq_n_u16(0);
-        uint16x8_t q3 = vdupq_n_u16(0);
-
-        uint8x16_t q4 = vld1q_u8(src);
-
-        sad_row16_neon(&q0, &q4, ref0);
-        sad_row16_neon(&q1, &q4, ref1);
-        sad_row16_neon(&q2, &q4, ref2);
-        sad_row16_neon(&q3, &q4, ref3);
-
-        q4 = vld1q_u8(src + 16);
-
-        sad_row16_neon(&q0, &q4, ref0 + 16);
-        sad_row16_neon(&q1, &q4, ref1 + 16);
-        sad_row16_neon(&q2, &q4, ref2 + 16);
-        sad_row16_neon(&q3, &q4, ref3 + 16);
-
-        q4 = vld1q_u8(src + 32);
-
-        sad_row16_neon(&q0, &q4, ref0 + 32);
-        sad_row16_neon(&q1, &q4, ref1 + 32);
-        sad_row16_neon(&q2, &q4, ref2 + 32);
-        sad_row16_neon(&q3, &q4, ref3 + 32);
-
-        q4 = vld1q_u8(src + 48);
-
-        sad_row16_neon(&q0, &q4, ref0 + 48);
-        sad_row16_neon(&q1, &q4, ref1 + 48);
-        sad_row16_neon(&q2, &q4, ref2 + 48);
-        sad_row16_neon(&q3, &q4, ref3 + 48);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
-      }
-      break;
-    }
-    case 128: {
-      for (int i = 0; i < height; i++) {
-        uint16x8_t q0 = vdupq_n_u16(0);
-        uint16x8_t q1 = vdupq_n_u16(0);
-        uint16x8_t q2 = vdupq_n_u16(0);
-        uint16x8_t q3 = vdupq_n_u16(0);
-
-        uint8x16_t q4 = vld1q_u8(src);
-
-        sad_row16_neon(&q0, &q4, ref0);
-        sad_row16_neon(&q1, &q4, ref1);
-        sad_row16_neon(&q2, &q4, ref2);
-        sad_row16_neon(&q3, &q4, ref3);
-
-        q4 = vld1q_u8(src + 16);
-
-        sad_row16_neon(&q0, &q4, ref0 + 16);
-        sad_row16_neon(&q1, &q4, ref1 + 16);
-        sad_row16_neon(&q2, &q4, ref2 + 16);
-        sad_row16_neon(&q3, &q4, ref3 + 16);
-
-        q4 = vld1q_u8(src + 32);
-
-        sad_row16_neon(&q0, &q4, ref0 + 32);
-        sad_row16_neon(&q1, &q4, ref1 + 32);
-        sad_row16_neon(&q2, &q4, ref2 + 32);
-        sad_row16_neon(&q3, &q4, ref3 + 32);
-
-        q4 = vld1q_u8(src + 48);
-
-        sad_row16_neon(&q0, &q4, ref0 + 48);
-        sad_row16_neon(&q1, &q4, ref1 + 48);
-        sad_row16_neon(&q2, &q4, ref2 + 48);
-        sad_row16_neon(&q3, &q4, ref3 + 48);
-
-        q4 = vld1q_u8(src + 64);
-
-        sad_row16_neon(&q0, &q4, ref0 + 64);
-        sad_row16_neon(&q1, &q4, ref1 + 64);
-        sad_row16_neon(&q2, &q4, ref2 + 64);
-        sad_row16_neon(&q3, &q4, ref3 + 64);
-
-        q4 = vld1q_u8(src + 80);
-
-        sad_row16_neon(&q0, &q4, ref0 + 80);
-        sad_row16_neon(&q1, &q4, ref1 + 80);
-        sad_row16_neon(&q2, &q4, ref2 + 80);
-        sad_row16_neon(&q3, &q4, ref3 + 80);
-
-        q4 = vld1q_u8(src + 96);
-
-        sad_row16_neon(&q0, &q4, ref0 + 96);
-        sad_row16_neon(&q1, &q4, ref1 + 96);
-        sad_row16_neon(&q2, &q4, ref2 + 96);
-        sad_row16_neon(&q3, &q4, ref3 + 96);
-
-        q4 = vld1q_u8(src + 112);
-
-        sad_row16_neon(&q0, &q4, ref0 + 112);
-        sad_row16_neon(&q1, &q4, ref1 + 112);
-        sad_row16_neon(&q2, &q4, ref2 + 112);
-        sad_row16_neon(&q3, &q4, ref3 + 112);
-
-        src += src_stride;
-        ref0 += ref_stride;
-        ref1 += ref_stride;
-        ref2 += ref_stride;
-        ref3 += ref_stride;
-
-        res[0] += horizontal_add_16x8(q0);
-        res[1] += horizontal_add_16x8(q1);
-        res[2] += horizontal_add_16x8(q2);
-        res[3] += horizontal_add_16x8(q3);
-      }
-    }
-  }
-}
-
-#define sad_skip_MxN_neon(m, n)                                             \
-  void aom_sad_skip_##m##x##n##x4d_neon(const uint8_t *src, int src_stride, \
-                                        const uint8_t *const ref[4],        \
-                                        int ref_stride, uint32_t res[4]) {  \
-    aom_sadMxNx4d_neon(m, ((n) >> 1), src, 2 * src_stride, ref,             \
-                       2 * ref_stride, res);                                \
-    res[0] <<= 1;                                                           \
-    res[1] <<= 1;                                                           \
-    res[2] <<= 1;                                                           \
-    res[3] <<= 1;                                                           \
-  }
-
-sad_skip_MxN_neon(4, 8);
-sad_skip_MxN_neon(4, 16);
-sad_skip_MxN_neon(4, 32);
-
-sad_skip_MxN_neon(8, 8);
-sad_skip_MxN_neon(8, 16);
-sad_skip_MxN_neon(8, 32);
-
-sad_skip_MxN_neon(16, 8);
-sad_skip_MxN_neon(16, 16);
-sad_skip_MxN_neon(16, 32);
-sad_skip_MxN_neon(16, 64);
-
-sad_skip_MxN_neon(32, 8);
-sad_skip_MxN_neon(32, 16);
-sad_skip_MxN_neon(32, 32);
-sad_skip_MxN_neon(32, 64);
-
-sad_skip_MxN_neon(64, 16);
-sad_skip_MxN_neon(64, 32);
-sad_skip_MxN_neon(64, 64);
-sad_skip_MxN_neon(64, 128);
-
-sad_skip_MxN_neon(128, 64);
-sad_skip_MxN_neon(128, 128);
-#undef sad_skip_MxN_neon
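For context: aom_sadMxNx4d_neon and the width-specific kernels removed above compute the SAD of one source block against four reference candidates in a single call, which is typically used by motion search to evaluate four candidates at once. A scalar sketch of the operation, for reference only and not part of the patch:

static void sad_mxnx4d_scalar(int width, int height, const uint8_t *src,
                              int src_stride, const uint8_t *const ref[4],
                              int ref_stride, uint32_t res[4]) {
  for (int k = 0; k < 4; ++k) {
    const uint8_t *s = src;
    const uint8_t *r = ref[k];
    uint32_t sad = 0;
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x)
        sad += (s[x] > r[x]) ? (uint32_t)(s[x] - r[x])
                             : (uint32_t)(r[x] - s[x]);
      s += src_stride;
      r += ref_stride;
    }
    res[k] = sad;
  }
}

The sad_skip_MxN_neon wrappers at the end of the file apply the same kernel to every other row (doubled strides, halved height) and double the result.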
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
deleted file mode 100644
index c3740a7..0000000
--- a/aom_dsp/arm/sad_neon.c
+++ /dev/null
@@ -1,558 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-
-unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
-                              const uint8_t *ref_ptr, int ref_stride) {
-  uint8x8_t d0, d8;
-  uint16x8_t q12;
-  uint32x4_t q1;
-  uint64x2_t q3;
-  uint32x2_t d5;
-  int i;
-
-  d0 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(d0, d8);
-
-  for (i = 0; i < 15; i++) {
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, d0, d8);
-  }
-
-  q1 = vpaddlq_u16(q12);
-  q3 = vpaddlq_u32(q1);
-  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                vreinterpret_u32_u64(vget_high_u64(q3)));
-
-  return vget_lane_u32(d5, 0);
-}
-
-unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  uint8x8_t d0, d8;
-  uint16x8_t q12;
-  uint32x2_t d1;
-  uint64x1_t d3;
-  int i;
-
-  d0 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(d0, d8);
-
-  for (i = 0; i < 3; i++) {
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, d0, d8);
-  }
-
-  d1 = vpaddl_u16(vget_low_u16(q12));
-  d3 = vpaddl_u32(d1);
-
-  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride,
-                              const uint8_t *ref_ptr, int ref_stride) {
-  uint8x16_t q0, q4;
-  uint16x8_t q12, q13;
-  uint32x4_t q1;
-  uint64x2_t q3;
-  uint32x2_t d5;
-  int i;
-
-  q0 = vld1q_u8(src_ptr);
-  src_ptr += src_stride;
-  q4 = vld1q_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
-  for (i = 0; i < 7; i++) {
-    q0 = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
-  }
-
-  q12 = vaddq_u16(q12, q13);
-  q1 = vpaddlq_u16(q12);
-  q3 = vpaddlq_u32(q1);
-  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                vreinterpret_u32_u64(vget_high_u64(q3)));
-
-  return vget_lane_u32(d5, 0);
-}
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
-  const uint32x4_t a = vpaddlq_u16(vec_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-  for (i = 0; i < 64; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
-                            vget_low_u8(vec_ref_32));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
-                            vget_high_u8(vec_ref_32));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
-                            vget_low_u8(vec_ref_48));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
-                            vget_high_u8(vec_ref_48));
-  }
-  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
-}
-
-unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
-                                 const uint8_t *ref, int ref_stride) {
-  uint16x8_t vec_accum_lo, vec_accum_hi;
-  uint32x4_t vec_accum_32lo = vdupq_n_u32(0);
-  uint32x4_t vec_accum_32hi = vdupq_n_u32(0);
-  uint16x8_t tmp;
-  for (int i = 0; i < 128; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-    const uint8x16_t vec_src_64 = vld1q_u8(src + 64);
-    const uint8x16_t vec_src_80 = vld1q_u8(src + 80);
-    const uint8x16_t vec_src_96 = vld1q_u8(src + 96);
-    const uint8x16_t vec_src_112 = vld1q_u8(src + 112);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-    const uint8x16_t vec_ref_64 = vld1q_u8(ref + 64);
-    const uint8x16_t vec_ref_80 = vld1q_u8(ref + 80);
-    const uint8x16_t vec_ref_96 = vld1q_u8(ref + 96);
-    const uint8x16_t vec_ref_112 = vld1q_u8(ref + 112);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vdupq_n_u16(0);
-    vec_accum_hi = vdupq_n_u16(0);
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
-                            vget_low_u8(vec_ref_32));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
-                            vget_high_u8(vec_ref_32));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
-                            vget_low_u8(vec_ref_48));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
-                            vget_high_u8(vec_ref_48));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_64),
-                            vget_low_u8(vec_ref_64));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_64),
-                            vget_high_u8(vec_ref_64));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_80),
-                            vget_low_u8(vec_ref_80));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_80),
-                            vget_high_u8(vec_ref_80));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_96),
-                            vget_low_u8(vec_ref_96));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_96),
-                            vget_high_u8(vec_ref_96));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_112),
-                            vget_low_u8(vec_ref_112));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_112),
-                            vget_high_u8(vec_ref_112));
-
-    tmp = vaddq_u16(vec_accum_lo, vec_accum_hi);
-    vec_accum_32lo = vaddw_u16(vec_accum_32lo, vget_low_u16(tmp));
-    vec_accum_32hi = vaddw_u16(vec_accum_32hi, vget_high_u16(tmp));
-  }
-  const uint32x4_t a = vaddq_u32(vec_accum_32lo, vec_accum_32hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
-  for (i = 0; i < 32; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
-  }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
-
-unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
-  for (i = 0; i < 16; ++i) {
-    const uint8x16_t vec_src = vld1q_u8(src);
-    const uint8x16_t vec_ref = vld1q_u8(ref);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo =
-        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
-    vec_accum_hi =
-        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
-  }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
-
-unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum = vdupq_n_u16(0);
-
-  for (i = 0; i < 8; ++i) {
-    const uint8x8_t vec_src = vld1_u8(src);
-    const uint8x8_t vec_ref = vld1_u8(ref);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
-  }
-  return horizontal_add_16x8(vec_accum);
-}
-
-static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
-                                         const uint8_t *ref_ptr, int ref_stride,
-                                         int h) {
-  int sum = 0;
-  for (int i = 0; i < h; i++) {
-    uint16x8_t q3 = vdupq_n_u16(0);
-
-    uint8x16_t q0 = vld1q_u8(src_ptr);
-    uint8x16_t q1 = vld1q_u8(ref_ptr);
-    uint8x16_t q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 16);
-    q1 = vld1q_u8(ref_ptr + 16);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 32);
-    q1 = vld1q_u8(ref_ptr + 32);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 48);
-    q1 = vld1q_u8(ref_ptr + 48);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 64);
-    q1 = vld1q_u8(ref_ptr + 64);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 80);
-    q1 = vld1q_u8(ref_ptr + 80);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 96);
-    q1 = vld1q_u8(ref_ptr + 96);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 112);
-    q1 = vld1q_u8(ref_ptr + 112);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    sum += horizontal_add_16x8(q3);
-  }
-
-  return sum;
-}
-
-static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  int sum = 0;
-  for (int i = 0; i < h; i++) {
-    uint16x8_t q3 = vdupq_n_u16(0);
-
-    uint8x16_t q0 = vld1q_u8(src_ptr);
-    uint8x16_t q1 = vld1q_u8(ref_ptr);
-    uint8x16_t q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 16);
-    q1 = vld1q_u8(ref_ptr + 16);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 32);
-    q1 = vld1q_u8(ref_ptr + 32);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 48);
-    q1 = vld1q_u8(ref_ptr + 48);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-
-    sum += horizontal_add_16x8(q3);
-  }
-
-  return sum;
-}
-
-static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  int sum = 0;
-  for (int i = 0; i < h; i++) {
-    uint16x8_t q3 = vdupq_n_u16(0);
-
-    uint8x16_t q0 = vld1q_u8(src_ptr);
-    uint8x16_t q1 = vld1q_u8(ref_ptr);
-    uint8x16_t q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    q0 = vld1q_u8(src_ptr + 16);
-    q1 = vld1q_u8(ref_ptr + 16);
-    q2 = vabdq_u8(q0, q1);
-    q3 = vpadalq_u8(q3, q2);
-
-    sum += horizontal_add_16x8(q3);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sum;
-}
-
-static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  int sum = 0;
-  for (int i = 0; i < h; i++) {
-    uint8x8_t q0 = vld1_u8(src_ptr);
-    uint8x8_t q1 = vld1_u8(ref_ptr);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
-    q0 = vld1_u8(src_ptr + 8);
-    q1 = vld1_u8(ref_ptr + 8);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
-    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sum;
-}
-
-static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
-                                       const uint8_t *ref_ptr, int ref_stride,
-                                       int h) {
-  uint16x8_t q3 = vdupq_n_u16(0);
-  for (int y = 0; y < h; y++) {
-    uint8x8_t q0 = vld1_u8(src_ptr);
-    uint8x8_t q1 = vld1_u8(ref_ptr);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    q3 = vabal_u8(q3, q0, q1);
-  }
-  return horizontal_add_16x8(q3);
-}
-
-static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
-                                       const uint8_t *ref_ptr, int ref_stride,
-                                       int h) {
-  uint16x8_t q3 = vdupq_n_u16(0);
-  uint32x2_t q0 = vdup_n_u32(0);
-  uint32x2_t q1 = vdup_n_u32(0);
-  uint32_t src4, ref4;
-  for (int y = 0; y < h / 2; y++) {
-    memcpy(&src4, src_ptr, 4);
-    memcpy(&ref4, ref_ptr, 4);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    q0 = vset_lane_u32(src4, q0, 0);
-    q1 = vset_lane_u32(ref4, q1, 0);
-
-    memcpy(&src4, src_ptr, 4);
-    memcpy(&ref4, ref_ptr, 4);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    q0 = vset_lane_u32(src4, q0, 1);
-    q1 = vset_lane_u32(ref4, q1, 1);
-
-    q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
-  }
-  return horizontal_add_16x8(q3);
-}
-
-#define FSADS128_H(h)                                                    \
-  unsigned int aom_sad_skip_128x##h##_neon(                              \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
-      int ref_stride) {                                                  \
-    const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr, \
-                                       2 * ref_stride, h / 2);           \
-    return 2 * sum;                                                      \
-  }
-FSADS128_H(128);
-FSADS128_H(64);
-#undef FSADS128_H
-
-#define FSADS64_H(h)                                                          \
-  unsigned int aom_sad_skip_64x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
-
-FSADS64_H(128);
-FSADS64_H(64);
-FSADS64_H(32);
-FSADS64_H(16);
-#undef FSADS64_H
-
-#define FSADS32_H(h)                                                          \
-  unsigned int aom_sad_skip_32x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
-
-FSADS32_H(64);
-FSADS32_H(32);
-FSADS32_H(16);
-FSADS32_H(8);
-#undef FSADS32_H
-
-#define FSADS16_H(h)                                                          \
-  unsigned int aom_sad_skip_16x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
-
-FSADS16_H(64);
-FSADS16_H(32);
-FSADS16_H(16);
-FSADS16_H(8);
-#undef FSADS16_H
-
-#define FSADS8_H(h)                                                          \
-  unsigned int aom_sad_skip_8x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
-      int ref_stride) {                                                      \
-    return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                           h / 2);                                           \
-  }
-
-FSADS8_H(32);
-FSADS8_H(16);
-FSADS8_H(8);
-#undef FSADS8_H
-
-#define FSADS4_H(h)                                                          \
-  unsigned int aom_sad_skip_4x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
-      int ref_stride) {                                                      \
-    return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                           h / 2);                                           \
-  }
-
-FSADS4_H(16);
-FSADS4_H(8);
-#undef FSADS4_H
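For reviewers, a minimal scalar sketch of what the removed aom_sad_skip_WxH_neon wrappers compute: SAD over every other row, doubled so the result stays comparable to a full-height SAD. The helper name below is illustrative, not part of the tree.

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride, int w,
                                 int h) {
  unsigned int sum = 0;
  for (int y = 0; y < h; y += 2) {  // only the even rows contribute
    for (int x = 0; x < w; ++x) sum += abs(src[x] - ref[x]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sum;  // compensate for the skipped rows
}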
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index 2a0c566..95fe124 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -16,171 +16,6 @@
 #include "av1/common/arm/mem_neon.h"
 #include "av1/common/arm/transpose_neon.h"
 
-static INLINE void sse_w16_neon(uint32x4_t *sum, const uint8_t *a,
-                                const uint8_t *b) {
-  const uint8x16_t v_a0 = vld1q_u8(a);
-  const uint8x16_t v_b0 = vld1q_u8(b);
-  const uint8x16_t diff = vabdq_u8(v_a0, v_b0);
-  const uint8x8_t diff_lo = vget_low_u8(diff);
-  const uint8x8_t diff_hi = vget_high_u8(diff);
-  *sum = vpadalq_u16(*sum, vmull_u8(diff_lo, diff_lo));
-  *sum = vpadalq_u16(*sum, vmull_u8(diff_hi, diff_hi));
-}
-static INLINE void aom_sse4x2_neon(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
-                                   uint32x4_t *sum) {
-  uint8x8_t v_a0, v_b0;
-  v_a0 = v_b0 = vcreate_u8(0);
-  // above line is only to shadow [-Werror=uninitialized]
-  v_a0 = vreinterpret_u8_u32(
-      vld1_lane_u32((uint32_t *)a, vreinterpret_u32_u8(v_a0), 0));
-  v_a0 = vreinterpret_u8_u32(
-      vld1_lane_u32((uint32_t *)(a + a_stride), vreinterpret_u32_u8(v_a0), 1));
-  v_b0 = vreinterpret_u8_u32(
-      vld1_lane_u32((uint32_t *)b, vreinterpret_u32_u8(v_b0), 0));
-  v_b0 = vreinterpret_u8_u32(
-      vld1_lane_u32((uint32_t *)(b + b_stride), vreinterpret_u32_u8(v_b0), 1));
-  const uint8x8_t v_a_w = vabd_u8(v_a0, v_b0);
-  *sum = vpadalq_u16(*sum, vmull_u8(v_a_w, v_a_w));
-}
-static INLINE void aom_sse8_neon(const uint8_t *a, const uint8_t *b,
-                                 uint32x4_t *sum) {
-  const uint8x8_t v_a_w = vld1_u8(a);
-  const uint8x8_t v_b_w = vld1_u8(b);
-  const uint8x8_t v_d_w = vabd_u8(v_a_w, v_b_w);
-  *sum = vpadalq_u16(*sum, vmull_u8(v_d_w, v_d_w));
-}
-int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int width, int height) {
-  int y = 0;
-  int64_t sse = 0;
-  uint32x4_t sum = vdupq_n_u32(0);
-  switch (width) {
-    case 4:
-      do {
-        aom_sse4x2_neon(a, a_stride, b, b_stride, &sum);
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-    case 8:
-      do {
-        aom_sse8_neon(a, b, &sum);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-    case 16:
-      do {
-        sse_w16_neon(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-    case 32:
-      do {
-        sse_w16_neon(&sum, a, b);
-        sse_w16_neon(&sum, a + 16, b + 16);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-    case 64:
-      do {
-        sse_w16_neon(&sum, a, b);
-        sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-    case 128:
-      do {
-        sse_w16_neon(&sum, a, b);
-        sse_w16_neon(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_neon(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_neon(&sum, a + 16 * 3, b + 16 * 3);
-        sse_w16_neon(&sum, a + 16 * 4, b + 16 * 4);
-        sse_w16_neon(&sum, a + 16 * 5, b + 16 * 5);
-        sse_w16_neon(&sum, a + 16 * 6, b + 16 * 6);
-        sse_w16_neon(&sum, a + 16 * 7, b + 16 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-    default:
-      if (width & 0x07) {
-        do {
-          int i = 0;
-          do {
-            aom_sse8_neon(a + i, b + i, &sum);
-            aom_sse8_neon(a + i + a_stride, b + i + b_stride, &sum);
-            i += 8;
-          } while (i + 4 < width);
-          aom_sse4x2_neon(a + i, a_stride, b + i, b_stride, &sum);
-          a += (a_stride << 1);
-          b += (b_stride << 1);
-          y += 2;
-        } while (y < height);
-      } else {
-        do {
-          int i = 0;
-          do {
-            aom_sse8_neon(a + i, b + i, &sum);
-            i += 8;
-          } while (i < width);
-          a += a_stride;
-          b += b_stride;
-          y += 1;
-        } while (y < height);
-      }
-#if defined(__aarch64__)
-      sse = vaddvq_u32(sum);
-#else
-      sse = horizontal_add_s32x4(vreinterpretq_s32_u32(sum));
-#endif  // __aarch64__
-      break;
-  }
-  return sse;
-}
-
 static INLINE uint32_t highbd_sse_W8x1_neon(uint16x8_t q2, uint16x8_t q3) {
   uint32_t sse;
   const uint32_t sse1 = 0;
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
deleted file mode 100644
index d75387e..0000000
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/variance.h"
-
-// Load 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
-  uint32_t a;
-  uint32x2_t a_u32 = vdup_n_u32(0);
-  if (stride == 4) return vld1_u8(buf);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  a_u32 = vld1_lane_u32(&a, a_u32, 0);
-  memcpy(&a, buf, 4);
-  a_u32 = vld1_lane_u32(&a, a_u32, 1);
-  return vreinterpret_u8_u32(a_u32);
-}
-
-// Process a block exactly 4 wide and a multiple of 2 high.
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      const uint8_t *filter) {
-  const uint8x8_t f0 = vdup_n_u8(filter[0]);
-  const uint8x8_t f1 = vdup_n_u8(filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; i += 2) {
-    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
-    const uint8x8_t src_1 =
-        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(output_ptr, out);
-    src_ptr += 2 * src_pixels_per_line;
-    output_ptr += 8;
-  }
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      unsigned int output_width,
-                                      const uint8_t *filter) {
-  const uint8x8_t f0 = vdup_n_u8(filter[0]);
-  const uint8x8_t f1 = vdup_n_u8(filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; ++i) {
-    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
-    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(output_ptr, out);
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-// Process a block which is a multiple of 16 wide and any height.
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
-                                       uint8_t *output_ptr,
-                                       unsigned int src_pixels_per_line,
-                                       int pixel_step,
-                                       unsigned int output_height,
-                                       unsigned int output_width,
-                                       const uint8_t *filter) {
-  const uint8x8_t f0 = vdup_n_u8(filter[0]);
-  const uint8x8_t f1 = vdup_n_u8(filter[1]);
-  unsigned int i, j;
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 16) {
-      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
-      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
-      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
-      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
-      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
-      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
-      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
-      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
-    }
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *dst, int dst_stride,
-                                            unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
-  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
-                            bilinear_filters_2t[yoffset]);
-  return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
-                             bilinear_filters_2t[yoffset]);
-  return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
-                             bilinear_filters_2t[yoffset]);
-  return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
-                             bilinear_filters_2t[yoffset]);
-  return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  uint8_t temp0[4 * (4 + 2)];
-  uint8_t temp1[4 * 4];
-
-  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (4 + 2),
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
-                            bilinear_filters_2t[yoffset]);
-
-  return aom_variance4x4(temp1, 4, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance4x8_neon(const uint8_t *a, int a_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  uint8_t temp0[4 * (8 + 2)];
-  uint8_t temp1[4 * 8];
-
-  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (8 + 2),
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 8,
-                            bilinear_filters_2t[yoffset]);
-
-  return aom_variance4x8(temp1, 4, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance4x16_neon(const uint8_t *a, int a_stride,
-                                             int xoffset, int yoffset,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  uint8_t temp0[4 * (16 + 2)];
-  uint8_t temp1[4 * 16];
-
-  var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (16 + 2),
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 16,
-                            bilinear_filters_2t[yoffset]);
-
-  return aom_variance4x16(temp1, 4, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance8x4_neon(const uint8_t *a, int a_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  uint8_t temp0[8 * (4 + 1)];
-  uint8_t temp1[8 * 4];
-
-  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (4 + 1), 8,
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 4, 8,
-                            bilinear_filters_2t[yoffset]);
-
-  return aom_variance8x4(temp1, 8, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance8x16_neon(const uint8_t *a, int a_stride,
-                                             int xoffset, int yoffset,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  uint8_t temp0[8 * (16 + 1)];
-  uint8_t temp1[8 * 16];
-
-  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (16 + 1), 8,
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 16, 8,
-                            bilinear_filters_2t[yoffset]);
-
-  return aom_variance8x16(temp1, 8, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance8x32_neon(const uint8_t *a, int a_stride,
-                                             int xoffset, int yoffset,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  uint8_t temp0[8 * (32 + 1)];
-  uint8_t temp1[8 * 32];
-
-  var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (32 + 1), 8,
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 32, 8,
-                            bilinear_filters_2t[yoffset]);
-
-  return aom_variance8x32(temp1, 8, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x4_neon(const uint8_t *a, int a_stride,
-                                             int xoffset, int yoffset,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  uint8_t temp0[16 * (4 + 1)];
-  uint8_t temp1[16 * 4];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (4 + 1), 16,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 4, 16,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance16x4(temp1, 16, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x8_neon(const uint8_t *a, int a_stride,
-                                             int xoffset, int yoffset,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  uint8_t temp0[16 * (8 + 1)];
-  uint8_t temp1[16 * 8];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 16,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 8, 16,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance16x8(temp1, 16, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x32_neon(const uint8_t *a, int a_stride,
-                                              int xoffset, int yoffset,
-                                              const uint8_t *b, int b_stride,
-                                              uint32_t *sse) {
-  uint8_t temp0[16 * (32 + 1)];
-  uint8_t temp1[16 * 32];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 16,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 32, 16,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance16x32(temp1, 16, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x64_neon(const uint8_t *a, int a_stride,
-                                              int xoffset, int yoffset,
-                                              const uint8_t *b, int b_stride,
-                                              uint32_t *sse) {
-  uint8_t temp0[16 * (64 + 1)];
-  uint8_t temp1[16 * 64];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 16,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 64, 16,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance16x64(temp1, 16, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance32x8_neon(const uint8_t *a, int a_stride,
-                                             int xoffset, int yoffset,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  uint8_t temp0[32 * (8 + 1)];
-  uint8_t temp1[32 * 8];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 32,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 8, 32,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance32x8(temp1, 32, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance32x16_neon(const uint8_t *a, int a_stride,
-                                              int xoffset, int yoffset,
-                                              const uint8_t *b, int b_stride,
-                                              uint32_t *sse) {
-  uint8_t temp0[32 * (16 + 1)];
-  uint8_t temp1[32 * 16];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 32,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 16, 32,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance32x16(temp1, 32, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance32x64_neon(const uint8_t *a, int a_stride,
-                                              int xoffset, int yoffset,
-                                              const uint8_t *b, int b_stride,
-                                              uint32_t *sse) {
-  uint8_t temp0[32 * (64 + 1)];
-  uint8_t temp1[32 * 64];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 32,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 64, 32,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance32x64(temp1, 32, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance64x16_neon(const uint8_t *a, int a_stride,
-                                              int xoffset, int yoffset,
-                                              const uint8_t *b, int b_stride,
-                                              uint32_t *sse) {
-  uint8_t temp0[64 * (16 + 1)];
-  uint8_t temp1[64 * 16];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 64,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 16, 64,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance64x16(temp1, 64, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance64x32_neon(const uint8_t *a, int a_stride,
-                                              int xoffset, int yoffset,
-                                              const uint8_t *b, int b_stride,
-                                              uint32_t *sse) {
-  uint8_t temp0[64 * (32 + 1)];
-  uint8_t temp1[64 * 32];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 64,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 32, 64,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance64x32(temp1, 64, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance64x128_neon(const uint8_t *a, int a_stride,
-                                               int xoffset, int yoffset,
-                                               const uint8_t *b, int b_stride,
-                                               uint32_t *sse) {
-  uint8_t temp0[64 * (128 + 1)];
-  uint8_t temp1[64 * 128];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 64,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 128, 64,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance64x128(temp1, 64, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance128x64_neon(const uint8_t *a, int a_stride,
-                                               int xoffset, int yoffset,
-                                               const uint8_t *b, int b_stride,
-                                               uint32_t *sse) {
-  uint8_t temp0[128 * (64 + 1)];
-  uint8_t temp1[128 * 64];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 128,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 64, 128,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance128x64(temp1, 128, b, b_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance128x128_neon(const uint8_t *a, int a_stride,
-                                                int xoffset, int yoffset,
-                                                const uint8_t *b, int b_stride,
-                                                uint32_t *sse) {
-  uint8_t temp0[128 * (128 + 1)];
-  uint8_t temp1[128 * 128];
-
-  var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 128,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 128, 128,
-                             bilinear_filters_2t[yoffset]);
-
-  return aom_variance128x128(temp1, 128, b, b_stride, sse);
-}
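The sub-pixel variance paths above all follow the same two-pass scheme: a horizontal 2-tap bilinear filter into a slightly taller temporary buffer, a vertical pass over that buffer, then plain variance against the reference. A minimal scalar sketch of the 2-tap filter, assuming FILTER_BITS == 7 as in the rounding shifts above (the function name is illustrative):

#include <stdint.h>

// out[j] = (f0 * in[j] + f1 * in[j + step] + 64) >> 7, with f0 + f1 == 128.
// step is 1 for the horizontal pass and the row stride for the vertical pass.
static void bilinear_2t_ref(const uint8_t *in, int in_stride, uint8_t *out,
                            int w, int h, int step, int f0, int f1) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j)
      out[j] = (uint8_t)((f0 * in[j] + f1 * in[j + step] + 64) >> 7);
    in += in_stride;
    out += w;
  }
}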
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
deleted file mode 100644
index d455238..0000000
--- a/aom_dsp/arm/variance_neon.c
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-#include "aom_dsp/arm/sum_neon.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, uint32_t *sse,
-                             int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo =
-          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
-      v_sse_hi =
-          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                        int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
-}
-
-void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
-}
-
-unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - ((sum * sum) >> 6);
-}
-
-unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8);
-}
-
-unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
-                   32, 32, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
-
-unsigned int aom_variance128x128_neon(const uint8_t *a, int a_stride,
-                                      const uint8_t *b, int b_stride,
-                                      unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  sum1 = sse1 = 0;
-  for (int i = 0; i < 16; i++) {
-    variance_neon_w8(a + (8 * i * a_stride), a_stride, b + (8 * i * b_stride),
-                     b_stride, 128, 8, &sse2, &sum2);
-    sse1 += sse2;
-    sum1 += sum2;
-  }
-
-  *sse = sse1;
-
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 14);
-}
-
-unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 4; i++) {
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  uint8x8_t d0u8, d2u8, d4u8, d6u8;
-  int16x4_t d22s16, d23s16, d24s16, d25s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint16x8_t q11u16, q12u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    d4u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d6u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(d0u8, d4u8);
-    q12u16 = vsubl_u8(d2u8, d6u8);
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
-                               const unsigned char *ref_ptr, int recon_stride,
-                               unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  int64x1_t d0s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  q7s32 = vdupq_n_s32(0);
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q10s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q10s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride) {
-  int16x4_t d22s16, d24s16, d26s16, d28s16;
-  int64x1_t d0s64;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  d0u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d4u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d1u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d5u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d2u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d6u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d3u8 = vld1_u8(src_ptr);
-  d7u8 = vld1_u8(ref_ptr);
-
-  q11u16 = vsubl_u8(d0u8, d4u8);
-  q12u16 = vsubl_u8(d1u8, d5u8);
-  q13u16 = vsubl_u8(d2u8, d6u8);
-  q14u16 = vsubl_u8(d3u8, d7u8);
-
-  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
-  q7s32 = vmull_s16(d22s16, d22s16);
-  q8s32 = vmull_s16(d24s16, d24s16);
-  q9s32 = vmull_s16(d26s16, d26s16);
-  q10s32 = vmull_s16(d28s16, d28s16);
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q9s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q9s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-// Load 4 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
-  uint32_t a;
-  uint32x4_t a_u32 = vdupq_n_u32(0);
-  if (stride == 4) return vld1q_u8(buf);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
-  memcpy(&a, buf, 4);
-  buf += stride;
-  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
-  return vreinterpretq_u8_u32(a_u32);
-}
-
-// The variance helper functions use int16_t for sum. 8 values are accumulated
-// and then added (at which point they expand up to int32_t). To avoid overflow,
-// there can be no more than 32767 / 255 ~= 128 values accumulated in each
-// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
-// rows = 128. Asserts have been added to each function to warn against reaching
-// this limit.
-
-// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int h, uint32_t *sse, int *sum) {
-  const int32x4_t zero = vdupq_n_s32(0);
-  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
-  int32x4_t sse_s32 = zero;
-
-  // Since width is only 4, sum_s16 only loads a half row per loop.
-  assert(h <= 256);
-
-  int i;
-  for (i = 0; i < h; i += 4) {
-    const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
-    const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
-    const int16x8_t diff_lo_s16 =
-        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
-    const int16x8_t diff_hi_s16 =
-        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
-
-    sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
-    sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
-    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
-                        vget_low_s16(diff_lo_s16));
-    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
-                        vget_high_s16(diff_lo_s16));
-
-    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
-                        vget_low_s16(diff_hi_s16));
-    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
-                        vget_high_s16(diff_hi_s16));
-
-    a += 4 * a_stride;
-    b += 4 * b_stride;
-  }
-
-#if defined(__aarch64__)
-  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
-  *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
-  *sum = horizontal_add_s16x8(sum_s16);
-  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
-}
-
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int w, int h, uint32_t *sse,
-                              int *sum) {
-  const int32x4_t zero = vdupq_n_s32(0);
-  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
-  int32x4_t sse_s32 = zero;
-
-  // The loop loads 16 values at a time but doubles them up when accumulating
-  // into sum_s16.
-  assert(w / 8 * h <= 128);
-
-  int i, j;
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 16) {
-      const uint8x16_t a_u8 = vld1q_u8(a + j);
-      const uint8x16_t b_u8 = vld1q_u8(b + j);
-
-      const int16x8_t diff_lo_s16 =
-          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
-      const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
-          vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
-
-      sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
-      sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
-      sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
-                          vget_low_s16(diff_lo_s16));
-      sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
-                          vget_high_s16(diff_lo_s16));
-
-      sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
-                          vget_low_s16(diff_hi_s16));
-      sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
-                          vget_high_s16(diff_hi_s16));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-#if defined(__aarch64__)
-  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
-  *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
-  *sum = horizontal_add_s16x8(sum_s16);
-  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
-}
-
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int h, uint32_t *sse, int *sum) {
-  const int32x4_t zero = vdupq_n_s32(0);
-  int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
-  int32x4_t sse_s32 = zero;
-
-  // Each column has its own accumulator entry in sum_s16.
-  assert(h <= 128);
-
-  int i = 0;
-  do {
-    const uint8x8_t a_0_u8 = vld1_u8(a);
-    const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
-    const uint8x8_t b_0_u8 = vld1_u8(b);
-    const uint8x8_t b_1_u8 = vld1_u8(b + b_stride);
-    const int16x8_t diff_0_s16 =
-        vreinterpretq_s16_u16(vsubl_u8(a_0_u8, b_0_u8));
-    const int16x8_t diff_1_s16 =
-        vreinterpretq_s16_u16(vsubl_u8(a_1_u8, b_1_u8));
-    sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
-    sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
-    sse_s32 =
-        vmlal_s16(sse_s32, vget_low_s16(diff_0_s16), vget_low_s16(diff_0_s16));
-    sse_s32 =
-        vmlal_s16(sse_s32, vget_low_s16(diff_1_s16), vget_low_s16(diff_1_s16));
-    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_0_s16),
-                        vget_high_s16(diff_0_s16));
-    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_1_s16),
-                        vget_high_s16(diff_1_s16));
-    a += a_stride + a_stride;
-    b += b_stride + b_stride;
-    i += 2;
-  } while (i < h);
-
-#if defined(__aarch64__)
-  *sum = vaddvq_s32(vpaddlq_s16(sum_s16));
-  *sse = (uint32_t)vaddvq_s32(sse_s32);
-#else
-  *sum = horizontal_add_s16x8(sum_s16);
-  *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
-#endif
-}
-
-#define varianceNxM(n, m, shift)                                            \
-  unsigned int aom_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
-                                            const uint8_t *b, int b_stride, \
-                                            unsigned int *sse) {            \
-    int sum;                                                                \
-    if (n == 4)                                                             \
-      variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum);           \
-    else if (n == 8)                                                        \
-      variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum);           \
-    else                                                                    \
-      variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum);         \
-    if (n * m < 16 * 16)                                                    \
-      return *sse - ((sum * sum) >> shift);                                 \
-    else                                                                    \
-      return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);              \
-  }
-
-static void variance_neon_wide_block(const uint8_t *a, int a_stride,
-                                     const uint8_t *b, int b_stride, int w,
-                                     int h, uint32_t *sse, int *sum) {
-  const int32x4_t zero = vdupq_n_s32(0);
-  int32x4_t v_diff = zero;
-  int64x2_t v_sse = vreinterpretq_s64_s32(zero);
-
-  int s, i, j;
-  for (s = 0; s < 16; s++) {
-    int32x4_t sse_s32 = zero;
-    int16x8_t sum_s16 = vreinterpretq_s16_s32(zero);
-    for (i = (s * h) >> 4; i < (((s + 1) * h) >> 4); ++i) {
-      for (j = 0; j < w; j += 16) {
-        const uint8x16_t a_u8 = vld1q_u8(a + j);
-        const uint8x16_t b_u8 = vld1q_u8(b + j);
-
-        const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(
-            vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)));
-        const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(
-            vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)));
-
-        sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
-        sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
-        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_lo_s16),
-                            vget_low_s16(diff_lo_s16));
-        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_lo_s16),
-                            vget_high_s16(diff_lo_s16));
-        sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff_hi_s16),
-                            vget_low_s16(diff_hi_s16));
-        sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff_hi_s16),
-                            vget_high_s16(diff_hi_s16));
-      }
-
-      a += a_stride;
-      b += b_stride;
-    }
-
-    v_diff = vpadalq_s16(v_diff, sum_s16);
-    v_sse = vpadalq_s32(v_sse, sse_s32);
-  }
-#if defined(__aarch64__)
-  int diff = vaddvq_s32(v_diff);
-  uint32_t sq = (uint32_t)vaddvq_u64(vreinterpretq_u64_s64(v_sse));
-#else
-  int diff = horizontal_add_s32x4(v_diff);
-  uint32_t sq = vget_lane_u32(
-      vreinterpret_u32_s64(vadd_s64(vget_low_s64(v_sse), vget_high_s64(v_sse))),
-      0);
-#endif
-
-  *sum = diff;
-  *sse = sq;
-}
-
-#define varianceNxM_wide(W, H)                                              \
-  unsigned int aom_variance##W##x##H##_neon(const uint8_t *a, int a_stride, \
-                                            const uint8_t *b, int b_stride, \
-                                            uint32_t *sse) {                \
-    int sum;                                                                \
-    variance_neon_wide_block(a, a_stride, b, b_stride, W, H, sse, &sum);    \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
-  }
-
-varianceNxM(4, 4, 4);
-varianceNxM(4, 8, 5);
-varianceNxM(8, 4, 5);
-varianceNxM(16, 32, 9);
-varianceNxM(32, 16, 9);
-varianceNxM_wide(128, 64);
-varianceNxM_wide(64, 128);
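All of the removed variance kernels reduce to the same identity, variance = SSE - sum^2 / (w * h), with the per-block shifts above (>> 6, >> 8, >> 10, ...) being log2(w * h). A minimal scalar sketch (illustrative name, not part of the tree):

#include <stdint.h>

static unsigned int variance_ref(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h,
                                 uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;  // SSE term, which the NEON kernels also return
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}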
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index a638b02..76cc25b 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -15,40 +15,6 @@
 #include "config/aom_dsp_rtcd.h"
 #include "aom_ports/mem.h"
 
-void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
-                      int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j] - d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 4; ++i, s += p)
-    for (j = 0; j < 4; sum += s[j], ++j) {
-    }
-
-  return (sum + 8) >> 4;
-}
-
-unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 8; ++i, s += p)
-    for (j = 0; j < 8; sum += s[j], ++j) {
-    }
-
-  return (sum + 32) >> 6;
-}
-
 unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
   int sum = 0;
@@ -87,181 +53,6 @@
   }
 }
 
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-// The order of the output coeff of the hadamard is not important. For
-// optimization purposes the final transpose may be skipped.
-void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                        tran_low_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t buffer2[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
-    // dynamic range [-2040, 2040]
-    // buffer2: 15 bit
-    // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-
-  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
-}
-
-void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                           int16_t *coeff) {
-  int16_t buffer[64];
-  int16_t buffer2[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (int idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (int idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
-    // dynamic range [-2040, 2040]
-    // buffer2: 15 bit
-    // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-
-  for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];
-}
-
-// In place 16x16 2D Hadamard transform
-void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                          tran_low_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    tran_low_t a0 = coeff[0];
-    tran_low_t a1 = coeff[64];
-    tran_low_t a2 = coeff[128];
-    tran_low_t a3 = coeff[192];
-
-    tran_low_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    tran_low_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    tran_low_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    tran_low_t b3 = (a2 - a3) >> 1;
-
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64] = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                             int16_t *coeff) {
-  for (int idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (int idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64] = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                          tran_low_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
-    aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 256; ++idx) {
-    tran_low_t a0 = coeff[0];
-    tran_low_t a1 = coeff[256];
-    tran_low_t a2 = coeff[512];
-    tran_low_t a3 = coeff[768];
-
-    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 16 bit, [-32640, 32640]
-    tran_low_t b1 = (a0 - a1) >> 2;  // b0-b3: 15 bit, dynamic range
-    tran_low_t b2 = (a2 + a3) >> 2;  // [-16320, 16320]
-    tran_low_t b3 = (a2 - a3) >> 2;
-
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[256] = b1 + b3;
-    coeff[512] = b0 - b2;
-    coeff[768] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
 static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
                                             ptrdiff_t src_stride,
                                             int16_t *coeff) {
diff --git a/aom_dsp/blend_a64_hmask.c b/aom_dsp/blend_a64_hmask.c
index d41f749..1542a01 100644
--- a/aom_dsp/blend_a64_hmask.c
+++ b/aom_dsp/blend_a64_hmask.c
@@ -19,28 +19,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      dst[i * dst_stride + j] = AOM_BLEND_A64(
-          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
-    }
-  }
-}
-
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
diff --git a/aom_dsp/blend_a64_mask.c b/aom_dsp/blend_a64_mask.c
index 4c42a77..91f00ed 100644
--- a/aom_dsp/blend_a64_mask.c
+++ b/aom_dsp/blend_a64_mask.c
@@ -34,93 +34,6 @@
 // In contrast, the output of the non-d16 functions will not be further rounded,
 // so we *should* use ROUND_POWER_OF_TWO there.
 
-void aom_lowbd_blend_a64_d16_mask_c(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  int i, j;
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = mask[i * mask_stride + j];
-        res = ((m * (int32_t)src0[i * src0_stride + j] +
-                (AOM_BLEND_A64_MAX_ALPHA - m) *
-                    (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = ROUND_POWER_OF_TWO(
-            mask[(2 * i) * mask_stride + (2 * j)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-            2);
-        res = ((m * (int32_t)src0[i * src0_stride + j] +
-                (AOM_BLEND_A64_MAX_ALPHA - m) *
-                    (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
-                                    mask[i * mask_stride + (2 * j + 1)]);
-        res = ((m * (int32_t)src0[i * src0_stride + j] +
-                (AOM_BLEND_A64_MAX_ALPHA - m) *
-                    (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
-                                    mask[(2 * i + 1) * mask_stride + j]);
-        res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
-                         (AOM_BLEND_A64_MAX_ALPHA - m) *
-                             (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  }
-}
-
 void aom_highbd_blend_a64_d16_mask_c(
     uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
@@ -225,63 +138,6 @@
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
 // be the same as dst, or dst can be different from both sources.
 
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
-                          const uint8_t *src0, uint32_t src0_stride,
-                          const uint8_t *src1, uint32_t src1_stride,
-                          const uint8_t *mask, uint32_t mask_stride, int w,
-                          int h, int subw, int subh) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = ROUND_POWER_OF_TWO(
-            mask[(2 * i) * mask_stride + (2 * j)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-            2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
-                                    mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
-                                    mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  }
-}
-
 void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
diff --git a/aom_dsp/blend_a64_vmask.c b/aom_dsp/blend_a64_vmask.c
index 413b773..d7586ea 100644
--- a/aom_dsp/blend_a64_vmask.c
+++ b/aom_dsp/blend_a64_vmask.c
@@ -19,29 +19,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  for (i = 0; i < h; ++i) {
-    const int m = mask[i];
-    for (j = 0; j < w; ++j) {
-      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                              src1[i * src1_stride + j]);
-    }
-  }
-}
-
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
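
The three removed 8-bit blend kernels above (hmask, mask, vmask) all reduce to the same 6-bit alpha blend that the remaining high-bitdepth kernels use. A minimal sketch of that blend, assuming the usual blend.h constants (AOM_BLEND_A64_MAX_ALPHA == 64, AOM_BLEND_A64_ROUND_BITS == 6); the helper name is illustrative, not library API:

```c
#include <stdint.h>

/* Round-to-nearest 6-bit alpha blend: m in [0, 64] weights a against b.
 * Equivalent to AOM_BLEND_A64(m, a, b) under the assumed constants. */
static inline uint16_t blend_a64_sketch(int m, uint16_t a, uint16_t b) {
  return (uint16_t)((m * a + (64 - m) * b + 32) >> 6);
}
```
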
diff --git a/aom_dsp/fastssim.c b/aom_dsp/fastssim.c
index 37e4849..ce2a78f 100644
--- a/aom_dsp/fastssim.c
+++ b/aom_dsp/fastssim.c
@@ -141,8 +141,8 @@
 
 static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
                                  int _s1ystride, const uint8_t *_src2,
-                                 int _s2ystride, int _w, int _h, uint32_t shift,
-                                 int buf_is_hbd) {
+                                 int _s2ystride, int _w, int _h,
+                                 uint32_t shift) {
   uint32_t *dst1;
   uint32_t *dst2;
   int w;
@@ -163,25 +163,16 @@
       int i1;
       i0 = 2 * i;
       i1 = FS_MINI(i0 + 1, _w);
-      if (!buf_is_hbd) {
-        dst1[j * w + i] =
-            _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
-            _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
-        dst2[j * w + i] =
-            _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
-            _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
-      } else {
-        uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
-        uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
-        dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
-                          (src1s[j0 * _s1ystride + i1] >> shift) +
-                          (src1s[j1 * _s1ystride + i0] >> shift) +
-                          (src1s[j1 * _s1ystride + i1] >> shift);
-        dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
-                          (src2s[j0 * _s2ystride + i1] >> shift) +
-                          (src2s[j1 * _s2ystride + i0] >> shift) +
-                          (src2s[j1 * _s2ystride + i1] >> shift);
-      }
+      uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+      uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+      dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+                        (src1s[j0 * _s1ystride + i1] >> shift) +
+                        (src1s[j1 * _s1ystride + i0] >> shift) +
+                        (src1s[j1 * _s1ystride + i1] >> shift);
+      dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+                        (src2s[j0 * _s2ystride + i1] >> shift) +
+                        (src2s[j1 * _s2ystride + i0] >> shift) +
+                        (src2s[j1 * _s2ystride + i1] >> shift);
     }
   }
 }
@@ -442,14 +433,13 @@
 
 static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
                         int _dystride, int _w, int _h, uint32_t _bd,
-                        uint32_t _shift, int buf_is_hbd) {
+                        uint32_t _shift) {
   fs_ctx ctx;
   double ret;
   int l;
   ret = 1;
   fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
-  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
-                       buf_is_hbd);
+  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
     fs_calc_structure(&ctx, l, _bd);
     ret *= fs_average(&ctx, l);
@@ -471,18 +461,17 @@
   aom_clear_system_state();
   assert(bd >= in_bd);
   assert(source->flags == dest->flags);
-  int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
   bd_shift = bd - in_bd;
 
   *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
                       dest->y_stride, source->y_crop_width,
-                      source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
+                      source->y_crop_height, in_bd, bd_shift);
   *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+                      source->uv_crop_height, in_bd, bd_shift);
   *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+                      source->uv_crop_height, in_bd, bd_shift);
   ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
   return convert_ssim_db(ssimv, 1.0);
 }
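
With the 8-bit branch gone, fs_downsample_level0() reads both planes unconditionally through CONVERT_TO_SHORTPTR(), so every buffer reaching the fastssim code is expected to follow the high-bitdepth pointer convention. A small sketch of that convention, with the macro bodies as defined in aom_dsp/aom_dsp_common.h (the read_sample helper is illustrative only):

```c
#include <stdint.h>

/* High-bitdepth planes travel as uint8_t * whose address is the real
 * uint16_t buffer address shifted left by one bit; the two macros invert
 * each other. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

static uint16_t read_sample(const uint8_t *plane, int offset) {
  /* 'plane' is assumed to already be a CONVERT_TO_BYTEPTR()'d pointer. */
  return CONVERT_TO_SHORTPTR(plane)[offset];
}
```
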
diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c
index a592f6e..853e5ec 100644
--- a/aom_dsp/fwd_txfm.c
+++ b/aom_dsp/fwd_txfm.c
@@ -138,7 +138,8 @@
   }
 }
 
-void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
   int i, j;
   tran_low_t intermediate[64];
   int pass;
@@ -221,8 +222,3 @@
     for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
   }
 }
-
-void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
-                          int stride) {
-  aom_fdct8x8_c(input, final_output, stride);
-}
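
Since the aom_fdct8x8_c body is renamed rather than deleted, anything that still needs the C 8x8 forward DCT calls the high-bitdepth symbol with the same arguments. A minimal sketch of such a call site; the wrapper and its buffers are hypothetical, and tran_low_t is assumed to be int32_t:

```c
#include <stdint.h>

typedef int32_t tran_low_t; /* assumed definition for this sketch */

/* Prototype taken from the hunk above. */
void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
                          int stride);

/* Hypothetical helper: forward-transform one row-major 8x8 residual block. */
static void fdct8x8_block(const int16_t residual[8 * 8],
                          tran_low_t coeff[8 * 8]) {
  aom_highbd_fdct8x8_c(residual, coeff, /*stride=*/8);
}
```
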
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index c9e545f..5d39cd0 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -258,109 +258,6 @@
 // constants for smaller block sizes, where the range of the 'sum' is
 // restricted to fewer bits.
 
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
-static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
-                                     int bh, const uint8_t *above,
-                                     const uint8_t *left, int shift1,
-                                     int multiplier) {
-  int sum = 0;
-
-  for (int i = 0; i < bw; i++) {
-    sum += above[i];
-  }
-  for (int i = 0; i < bh; i++) {
-    sum += left[i];
-  }
-
-  const int expected_dc = divide_using_multiply_shift(
-      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
-  assert(expected_dc < (1 << 8));
-
-  for (int r = 0; r < bh; r++) {
-    memset(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-#undef DC_SHIFT2
-
-void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
-}
-
-#undef DC_MULTIPLIER_1X2
-#undef DC_MULTIPLIER_1X4
-
 static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int bd) {
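
The removed rectangular DC predictors avoid a true division by (bw + bh): assuming divide_using_multiply_shift(num, s1, mult, s2) computes ((num >> s1) * mult) >> s2, the 4x8 case reduces to ((sum + 6) >> 2) * 0x5556 >> 16. The standalone sketch below (not library code) checks that this matches exact division by 12 over the whole 8-bit input range:

```c
#include <assert.h>
#include <stdio.h>

int main(void) {
  /* 4x8 block: 12 reference pixels, each at most 255. */
  for (int sum = 0; sum <= 12 * 255; ++sum) {
    const int fast = (((sum + 6) >> 2) * 0x5556) >> 16;
    assert(fast == (sum + 6) / 12);
  }
  printf("DC_MULTIPLIER_1X2 path matches exact division for 4x8 blocks\n");
  return 0;
}
```
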
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index 5cec583..eb14d7d 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -59,27 +59,6 @@
   }
 }
 
-static INLINE void filt_generic(int q_threshold, int width, uint8_t *s,
-                                const int pitch) {
-  if (width < 1) return;
-
-  int delta_m2 = (3 * (s[0] - s[-1 * pitch]) - (s[pitch] - s[-2 * pitch])) * 4;
-
-  int q_thresh_clamp = q_threshold * q_thresh_mults[width - 1];
-  delta_m2 = clamp(delta_m2, -q_thresh_clamp, q_thresh_clamp);
-
-  delta_m2 *= w_mult[width - 1];
-
-  for (int i = 0; i < width; i++) {
-    s[(-i - 1) * pitch] =
-        clip_pixel(s[(-i - 1) * pitch] +
-                   ROUND_POWER_OF_TWO(delta_m2 * (width - i), 3 + DF_SHIFT));
-    s[i * pitch] =
-        clip_pixel(s[i * pitch] -
-                   ROUND_POWER_OF_TWO(delta_m2 * (width - i), 3 + DF_SHIFT));
-  }
-}
-
 #define DBL_CUSTOM_DECIS 3
 #define DBL_REG_DECIS_LEN MAX_DBL_FLT_LEN - DBL_CUSTOM_DECIS
 
@@ -246,152 +225,6 @@
   return MAX_DBL_FLT_LEN;
 }
 
-// Determining number of samples to be modified for the current row/column
-static INLINE int filt_choice(uint8_t *s, int pitch, int max_filt,
-                              uint16_t q_thresh, uint16_t side_thresh) {
-  if (!q_thresh || !side_thresh) return 0;
-
-  int max_samples = max_filt / 2 - 1;
-
-  int16_t second_derivs_buf[SEC_DERIV_ARRAY_LEN];
-  int16_t *second_deriv = &second_derivs_buf[(SEC_DERIV_ARRAY_LEN >> 1)];
-
-  int8_t mask = 0;
-
-  // Testing for 1 sample modification
-  //-----------------------------------------------
-  second_deriv[-2] = abs(s[-3 * pitch] - (s[-2 * pitch] << 1) + s[-pitch]);
-  second_deriv[1] = abs(s[0] - (s[pitch] << 1) + s[2 * pitch]);
-
-  mask |= (second_deriv[-2] > side_thresh) * -1;
-  mask |= (second_deriv[1] > side_thresh) * -1;
-
-  if (mask) return 0;
-
-  if (max_samples == 1) return 1;
-
-  // Testing for 2 sample modification
-  //-----------------------------------------------
-  const int side_thresh2 = side_thresh >> 2;
-
-  mask |= (second_deriv[-2] > side_thresh2) * -1;
-  mask |= (second_deriv[1] > side_thresh2) * -1;
-
-  second_deriv[-1] = abs(s[-2 * pitch] - (s[-pitch] << 1) + s[0]);
-  second_deriv[0] = abs(s[-1 * pitch] - (s[0] << 1) + s[pitch]);
-
-  mask |= ((second_deriv[-1] + second_deriv[0]) > q_thresh * DF_6_THRESH) * -1;
-
-  if (mask) return 1;
-
-  if (max_samples == 2) return 2;
-
-  // Testing 3 sample modification
-  //-----------------------------------------------
-  const int side_thresh3 = side_thresh >> FILT_8_THRESH_SHIFT;
-
-  mask |= (second_deriv[-2] > side_thresh3) * -1;
-  mask |= (second_deriv[1] > side_thresh3) * -1;
-
-#if !DF_SHORT_DEC
-  second_deriv[-3] = abs(s[-4 * pitch] - (s[-3 * pitch] << 1) + s[-2 * pitch]);
-  second_deriv[2] = abs(s[pitch] - (s[2 * pitch] << 1) + s[3 * pitch]);
-
-  mask |= (second_deriv[-3] > side_thresh3) * -1;
-  mask |= (second_deriv[2] > side_thresh3) * -1;
-#endif  //! DF_SHORT_DEC
-
-  mask |= ((second_deriv[-1] + second_deriv[0]) > q_thresh * DF_8_THRESH) * -1;
-
-  int end_dir_thresh = (side_thresh * 3) >> 4;
-  mask |= (abs((s[-1 * pitch] - s[(-3 - 1) * pitch]) -
-               3 * (s[-1 * pitch] - s[-2 * pitch])) > end_dir_thresh) *
-          -1;
-  mask |= (abs((s[0] - s[3 * pitch]) - 3 * (s[0] - s[1 * pitch])) >
-           end_dir_thresh) *
-          -1;
-
-  if (mask) return 2;
-
-  if (max_samples == 3) return 3;
-
-    // Testing  4 sample modification and above
-    //-----------------------------------------------
-#if !DF_SHORT_DEC
-#if DF_SPARSE
-  int p_deriv_sum = 0;
-  int q_deriv_sum = 0;
-#else
-  int p_deriv_sum = second_deriv[-3] << DF_SIDE_SUM_SHIFT;
-  int q_deriv_sum = second_deriv[2] << DF_SIDE_SUM_SHIFT;
-#endif  // DF_SPARSE
-#endif  // !DF_SHORT_DEC
-
-  int p_first_deriv_scaled = second_deriv[-2] << DF_SIDE_FIRST_SHIFT;
-  int q_first_deriv_scaled = second_deriv[1] << DF_SIDE_FIRST_SHIFT;
-
-  int transition = (second_deriv[-1] + second_deriv[0]) << DF_Q_THRESH_SHIFT;
-
-#if DF_SPARSE
-  for (int dist = 4; dist < MAX_DBL_FLT_LEN + 1; dist += 2) {
-#if !DF_SHORT_DEC
-    second_deriv[-(dist - 1)] =
-        abs(s[-(dist)*pitch] - (s[-(dist - 1) * pitch] << 1) +
-            s[-(dist - 2) * pitch]);
-    second_deriv[dist - 2] =
-        abs(s[(dist - 3) * pitch] - (s[(dist - 2) * pitch] << 1) +
-            s[(dist - 1) * pitch]);
-
-    p_deriv_sum += (second_deriv[-(dist - 1)] << DF_SIDE_SUM_SHIFT);
-    q_deriv_sum += (second_deriv[dist - 2] << DF_SIDE_SUM_SHIFT);
-#endif  // !DF_SHORT_DEC
-#else
-  for (int dist = 4; dist < MAX_DBL_FLT_LEN + 1; ++dist) {
-#endif  // DF_SPARSE
-
-    second_deriv[-dist] = abs(s[(-dist - 1) * pitch] - (s[-dist * pitch] << 1) +
-                              s[(-dist + 1) * pitch]);
-    second_deriv[dist - 1] = abs(
-        s[(dist - 2) * pitch] - (s[(dist - 1) * pitch] << 1) + s[dist * pitch]);
-#if !DF_SHORT_DEC
-    p_deriv_sum += (second_deriv[-dist] << DF_SIDE_SUM_SHIFT);
-    q_deriv_sum += (second_deriv[dist - 1] << DF_SIDE_SUM_SHIFT);
-
-    const int sum_side_thresh4 = side_thresh * side_sum[dist - 4];
-#endif
-
-    int side_thresh4 = side_thresh * side_first[dist - 4];
-
-    const int q_thresh4 = q_thresh * q_first[dist - 4];
-#if !DF_SHORT_DEC
-    mask |= (p_deriv_sum > sum_side_thresh4) * -1;
-    mask |= (q_deriv_sum > sum_side_thresh4) * -1;
-#endif
-
-    mask |= (p_first_deriv_scaled > side_thresh4) * -1;
-    mask |= (q_first_deriv_scaled > side_thresh4) * -1;
-
-    mask |= (transition > q_thresh4) * -1;
-
-    end_dir_thresh = (side_thresh * dist) >> 4;
-    mask |= (abs((s[-1 * pitch] - s[(-dist - 1) * pitch]) -
-                 dist * (s[-1 * pitch] - s[-2 * pitch])) > end_dir_thresh) *
-            -1;
-    mask |= (abs((s[0] - s[dist * pitch]) - dist * (s[0] - s[1 * pitch])) >
-             end_dir_thresh) *
-            -1;
-
-#if DF_SPARSE
-    if (mask) return dist - 2;
-    if (max_samples <= dist) return ((dist >> 1) << 1);
-#else
-    if (mask) return dist - 1;
-    if (max_samples == dist) return dist;
-#endif  // DF_SPARSE
-  }
-  return MAX_DBL_FLT_LEN;
-}
-
 void aom_highbd_lpf_horizontal_generic_c(uint16_t *s, int pitch, int filt_width,
                                          const uint16_t *q_thresh,
                                          const uint16_t *side_thresh, int bd) {
@@ -449,68 +282,8 @@
   }
 }
 
-void aom_lpf_horizontal_generic_c(uint8_t *s, int pitch, int filt_width,
-                                  const uint16_t *q_thresh,
-                                  const uint16_t *side_thresh) {
-  int i;
-  int count = 4;
-
-#if EDGE_DECISION
-
-  const int filter0 =
-      filt_choice(s, pitch, filt_width, *q_thresh, *side_thresh);
-  s += count - 1;
-  const int filter3 =
-      filt_choice(s, pitch, filt_width, *q_thresh, *side_thresh);
-  s -= count - 1;
-
-  int filter = AOMMIN(filter0, filter3);
-#endif  // EDGE_DECISION
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-#if !EDGE_DECISION
-    int filter = filt_choice(s, pitch, filt_width, *q_thresh, *side_thresh);
-#endif
-
-    filt_generic(*q_thresh, filter, s, pitch);
-
-    ++s;
-  }
-}
-
-void aom_lpf_vertical_generic_c(uint8_t *s, int pitch, int filt_width,
-                                const uint16_t *q_thresh,
-                                const uint16_t *side_thresh) {
-  int i;
-  int count = 4;
-
-#if EDGE_DECISION
-  const int filter0 = filt_choice(s, 1, filt_width, *q_thresh, *side_thresh);
-  const int filter3 = filt_choice(s + (count - 1) * pitch, 1, filt_width,
-                                  *q_thresh, *side_thresh);
-  int filter = AOMMIN(filter0, filter3);
-#endif  // EDGE_DECISION
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-#if !EDGE_DECISION
-    int filter = filt_choice(s, 1, filt_width, *q_thresh, *side_thresh);
-#endif
-
-    filt_generic(*q_thresh, filter, s, 1);
-
-    s += pitch;
-  }
-}
-
 #else  // !CONFIG_NEW_DF
 
-static INLINE int8_t signed_char_clamp(int t) {
-  return (int8_t)clamp(t, -128, 127);
-}
-
 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
   switch (bd) {
     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
@@ -520,418 +293,6 @@
   }
 }
 
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
-                                  uint8_t p0, uint8_t q0, uint8_t q1) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
-                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
-                                 uint8_t q1, uint8_t q2, uint8_t q3) {
-  int8_t mask = 0;
-  mask |= (abs(p3 - p2) > limit) * -1;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(q3 - q2) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
-                                         uint8_t p2, uint8_t p1, uint8_t p0,
-                                         uint8_t q0, uint8_t q1, uint8_t q2) {
-  int8_t mask = 0;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
-                                       uint8_t p0, uint8_t q0, uint8_t q1,
-                                       uint8_t q2) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > thresh) * -1;
-  mask |= (abs(q1 - q0) > thresh) * -1;
-  mask |= (abs(p2 - p0) > thresh) * -1;
-  mask |= (abs(q2 - q0) > thresh) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
-                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
-                                uint8_t q2, uint8_t q3) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > thresh) * -1;
-  mask |= (abs(q1 - q0) > thresh) * -1;
-  mask |= (abs(p2 - p0) > thresh) * -1;
-  mask |= (abs(q2 - q0) > thresh) * -1;
-  mask |= (abs(p3 - p0) > thresh) * -1;
-  mask |= (abs(q3 - q0) > thresh) * -1;
-  return ~mask;
-}
-
-// is there high edge variance internal edge: 11111111 yes, 00000000 no
-static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
-                              uint8_t q0, uint8_t q1) {
-  int8_t hev = 0;
-  hev |= (abs(p1 - p0) > thresh) * -1;
-  hev |= (abs(q1 - q0) > thresh) * -1;
-  return hev;
-}
-
-static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
-                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-
-  const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
-  const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
-  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
-  const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
-  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
-
-  // add outer taps if we have high edge variance
-  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-  // inner taps
-  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-
-  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
-  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
-
-  // outer tap adjustments
-  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
-  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
-  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
-}
-
-void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
-                            const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p];
-    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1];
-    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
-    s += pitch;
-  }
-}
-
-void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
-                           uint8_t *op2, uint8_t *op1, uint8_t *op0,
-                           uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
-  if (flat && mask) {
-    const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
-    // 5-tap filter [1, 2, 2, 2, 1]
-    *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
-    *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
-  } else {
-    filter4(mask, thresh, op1, op0, oq0, oq1);
-  }
-}
-
-static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
-                           uint8_t *op3, uint8_t *op2, uint8_t *op1,
-                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
-                           uint8_t *oq2, uint8_t *oq3) {
-  if (flat && mask) {
-    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
-    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
-  } else {
-    filter4(mask, thresh, op1, op0, oq0, oq1);
-  }
-}
-
-void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
-    const int8_t mask =
-        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
-    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
-    filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
-            s + 2 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-            s + 1 * p, s + 2 * p, s + 3 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
-    const int8_t mask =
-        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
-    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
-    filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
-    s += pitch;
-  }
-}
-
-void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
-            s + 3);
-    s += pitch;
-  }
-}
-
-void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op6, uint8_t *op5,
-                            uint8_t *op4, uint8_t *op3, uint8_t *op2,
-                            uint8_t *op1, uint8_t *op0, uint8_t *oq0,
-                            uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
-                            uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
-  if (flat2 && flat && mask) {
-    const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
-                  p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
-                  q5 = *oq5, q6 = *oq6;
-
-    // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
-    *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
-                              4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
-                                  q0 + q1 + q2 + q3 + q4,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
-                                  q0 * 2 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
-                                  q1 * 2 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
-                                  q2 * 2 + q3 + q4 + q5 + q6 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
-                              4);
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int count) {
-  int i;
-  int step = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < step * count; ++i) {
-    const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
-                  p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
-                  q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
-
-    filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
-             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
-             s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
-}
-
-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
-  mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
-}
-
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count) {
-  int i;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
-                  p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
-                  q5 = s[5], q6 = s[6];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
-
-    filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
-             s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
-    s += p;
-  }
-}
-
-void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
-}
-
-void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                                const uint8_t *limit0, const uint8_t *thresh0,
-                                const uint8_t *blimit1, const uint8_t *limit1,
-                                const uint8_t *thresh1) {
-  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
-  mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
-}
-
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                          uint16_t p1, uint16_t p0, uint16_t q0,
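
One detail in the removed 8-bit filter4() above that is easy to miss: samples are XOR-ed with 0x80 so the unsigned range [0, 255] becomes the signed range [-128, 127], which is what lets the filter clamp with signed_char_clamp() and lets the SIMD versions rely on signed 8-bit saturation. A standalone sketch of that mapping (not library code):

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int pix = 0; pix <= 255; ++pix) {
    /* Forward mapping applied to p1/p0/q0/q1 before filtering. */
    const int8_t centered = (int8_t)((uint8_t)pix ^ 0x80);
    assert(centered == pix - 128);
    /* Inverse mapping applied when writing the result back. */
    assert((uint8_t)(centered ^ 0x80) == (uint8_t)pix);
  }
  return 0;
}
```
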
diff --git a/aom_dsp/mips/intrapred_msa.c b/aom_dsp/mips/intrapred_msa.c
deleted file mode 100644
index 262a531..0000000
--- a/aom_dsp/mips/intrapred_msa.c
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
-  {                                             \
-    out0 = __msa_subs_u_h(out0, in0);           \
-    out1 = __msa_subs_u_h(out1, in1);           \
-  }
-
-static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint32_t src_data;
-
-  src_data = LW(src);
-
-  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
-}
-
-static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint32_t row;
-  uint32_t src_data1, src_data2;
-
-  src_data1 = LW(src);
-  src_data2 = LW(src + 4);
-
-  for (row = 8; row--;) {
-    SW(src_data1, dst);
-    SW(src_data2, (dst + 4));
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
-                                         int32_t dst_stride) {
-  uint32_t row;
-  v16u8 src0;
-
-  src0 = LD_UB(src);
-
-  for (row = 16; row--;) {
-    ST_UB(src0, dst);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
-                                         int32_t dst_stride) {
-  uint32_t row;
-  v16u8 src1, src2;
-
-  src1 = LD_UB(src);
-  src2 = LD_UB(src + 16);
-
-  for (row = 32; row--;) {
-    ST_UB2(src1, src2, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint32_t out0, out1, out2, out3;
-
-  out0 = src[0] * 0x01010101;
-  out1 = src[1] * 0x01010101;
-  out2 = src[2] * 0x01010101;
-  out3 = src[3] * 0x01010101;
-
-  SW4(out0, out1, out2, out3, dst, dst_stride);
-}
-
-static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-
-  out0 = src[0] * 0x0101010101010101ull;
-  out1 = src[1] * 0x0101010101010101ull;
-  out2 = src[2] * 0x0101010101010101ull;
-  out3 = src[3] * 0x0101010101010101ull;
-  out4 = src[4] * 0x0101010101010101ull;
-  out5 = src[5] * 0x0101010101010101ull;
-  out6 = src[6] * 0x0101010101010101ull;
-  out7 = src[7] * 0x0101010101010101ull;
-
-  SD4(out0, out1, out2, out3, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(out4, out5, out6, out7, dst, dst_stride);
-}
-
-static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  uint32_t row;
-  uint8_t inp0, inp1, inp2, inp3;
-  v16u8 src0, src1, src2, src3;
-
-  for (row = 4; row--;) {
-    inp0 = src[0];
-    inp1 = src[1];
-    inp2 = src[2];
-    inp3 = src[3];
-    src += 4;
-
-    src0 = (v16u8)__msa_fill_b(inp0);
-    src1 = (v16u8)__msa_fill_b(inp1);
-    src2 = (v16u8)__msa_fill_b(inp2);
-    src3 = (v16u8)__msa_fill_b(inp3);
-
-    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  uint32_t row;
-  uint8_t inp0, inp1, inp2, inp3;
-  v16u8 src0, src1, src2, src3;
-
-  for (row = 8; row--;) {
-    inp0 = src[0];
-    inp1 = src[1];
-    inp2 = src[2];
-    inp3 = src[3];
-    src += 4;
-
-    src0 = (v16u8)__msa_fill_b(inp0);
-    src1 = (v16u8)__msa_fill_b(inp1);
-    src2 = (v16u8)__msa_fill_b(inp2);
-    src3 = (v16u8)__msa_fill_b(inp3);
-
-    ST_UB2(src0, src0, dst, 16);
-    dst += dst_stride;
-    ST_UB2(src1, src1, dst, 16);
-    dst += dst_stride;
-    ST_UB2(src2, src2, dst, 16);
-    dst += dst_stride;
-    ST_UB2(src3, src3, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
-                                     const uint8_t *src_left, uint8_t *dst,
-                                     int32_t dst_stride) {
-  uint32_t val0, val1;
-  v16i8 store, src = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  val0 = LW(src_top);
-  val1 = LW(src_left);
-  INSERT_W2_SB(val0, val1, src);
-  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_w((v4i32)store, 0);
-
-  SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint32_t val0;
-  v16i8 store, data = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-
-  val0 = LW(src);
-  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
-  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_w((v4i32)store, 0);
-
-  SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
-  uint32_t out;
-  const v16i8 store = __msa_ldi_b(128);
-
-  out = __msa_copy_u_w((v4i32)store, 0);
-
-  SW4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
-                                     const uint8_t *src_left, uint8_t *dst,
-                                     int32_t dst_stride) {
-  uint64_t val0, val1;
-  v16i8 store;
-  v16u8 src = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  val0 = LD(src_top);
-  val1 = LD(src_left);
-  INSERT_D2_UB(val0, val1, src);
-  sum_h = __msa_hadd_u_h(src, src);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_d((v2i64)store, 0);
-
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint64_t val0;
-  v16i8 store;
-  v16u8 data = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  val0 = LD(src);
-  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
-  sum_h = __msa_hadd_u_h(data, data);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_d((v2i64)store, 0);
-
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
-  uint64_t out;
-  const v16i8 store = __msa_ldi_b(128);
-
-  out = __msa_copy_u_d((v2i64)store, 0);
-
-  SD4(out, out, out, out, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
-                                       const uint8_t *src_left, uint8_t *dst,
-                                       int32_t dst_stride) {
-  v16u8 top, left, out;
-  v8u16 sum_h, sum_top, sum_left;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  top = LD_UB(src_top);
-  left = LD_UB(src_left);
-  HADD_UB2_UH(top, left, sum_top, sum_left);
-  sum_h = sum_top + sum_left;
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-  dst += (8 * dst_stride);
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  v16u8 data, out;
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  data = LD_UB(src);
-  sum_h = __msa_hadd_u_h(data, data);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-  dst += (8 * dst_stride);
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
-  const v16u8 out = (v16u8)__msa_ldi_b(128);
-
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-  dst += (8 * dst_stride);
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
-                                       const uint8_t *src_left, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint32_t row;
-  v16u8 top0, top1, left0, left1, out;
-  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  LD_UB2(src_top, 16, top0, top1);
-  LD_UB2(src_left, 16, left0, left1);
-  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
-  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
-  sum_h = sum_top0 + sum_top1;
-  sum_h += sum_left0 + sum_left1;
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  for (row = 16; row--;) {
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  uint32_t row;
-  v16u8 data0, data1, out;
-  v8u16 sum_h, sum_data0, sum_data1;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  LD_UB2(src, 16, data0, data1);
-  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
-  sum_h = sum_data0 + sum_data1;
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  for (row = 16; row--;) {
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
-  uint32_t row;
-  const v16u8 out = (v16u8)__msa_ldi_b(128);
-
-  for (row = 16; row--;) {
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_4x4_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_8x8_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_16x16_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_32x32_msa(above, dst, y_stride);
-}
-
-void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_4x4_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_8x8_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_16x16_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                              const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                              const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
-}
-
-void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_4x4_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_8x8_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_16x16_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_32x32_msa(dst, y_stride);
-}
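
The MIPS/MSA predictors deleted above only ever covered the 8-bit path. For reference, the 8x8 DC case they implement reduces to the rounded average of the eight above and eight left samples; a scalar sketch of that computation (illustrative only, not part of this change):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch only: scalar equivalent of the deleted MSA 8x8 DC predictor.
     * It fills the block with the rounded mean of the 16 border samples,
     * matching the __msa_srari_w(sum, 4) rounding in the removed code. */
    static void dc_predictor_8x8_sketch(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
      int sum = 0;
      for (int i = 0; i < 8; ++i) sum += above[i] + left[i];
      const uint8_t dc = (uint8_t)((sum + 8) >> 4);
      for (int r = 0; r < 8; ++r) {
        for (int c = 0; c < 8; ++c) dst[c] = dc;
        dst += stride;
      }
    }
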
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index cc47019..a6b5396 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -43,16 +43,13 @@
     return block_mean / (max_w * max_h);                                    \
   }
 
-GET_BLOCK_MEAN(uint8_t, lowbd);
 GET_BLOCK_MEAN(uint16_t, highbd);
 
 static INLINE double get_block_mean(const uint8_t *data, int w, int h,
                                     int stride, int x_o, int y_o,
-                                    int block_size, int use_highbd) {
-  if (use_highbd)
-    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
-                                 block_size);
-  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+                                    int block_size) {
+  return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
+                               block_size);
 }
 
 // Defines a function that can be used to obtain the variance of a block
@@ -77,19 +74,14 @@
     return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
   }
 
-GET_NOISE_VAR(uint8_t, lowbd);
 GET_NOISE_VAR(uint16_t, highbd);
 
 static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
                                    int w, int h, int stride, int x_o, int y_o,
-                                   int block_size_x, int block_size_y,
-                                   int use_highbd) {
-  if (use_highbd)
-    return get_noise_var_highbd((const uint16_t *)data,
-                                (const uint16_t *)denoised, w, h, stride, x_o,
-                                y_o, block_size_x, block_size_y);
-  return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
-                             block_size_x, block_size_y);
+                                   int block_size_x, int block_size_y) {
+  return get_noise_var_highbd((const uint16_t *)data,
+                              (const uint16_t *)denoised, w, h, stride, x_o,
+                              y_o, block_size_x, block_size_y);
 }
 
 static void equation_system_clear(aom_equation_system_t *eqns) {
@@ -422,7 +414,7 @@
 }
 
 int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
-                               int block_size, int bit_depth, int use_highbd) {
+                               int block_size, int bit_depth) {
   const int n = block_size * block_size;
   aom_equation_system_t eqns;
   double *AtA_inv = 0;
@@ -453,7 +445,6 @@
   block_finder->AtA_inv = AtA_inv;
   block_finder->block_size = block_size;
   block_finder->normalization = (1 << bit_depth) - 1;
-  block_finder->use_highbd = use_highbd;
 
   for (y = 0; y < block_size; ++y) {
     const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
@@ -506,24 +497,13 @@
   double AtA_inv_b[kLowPolyNumParams];
   int xi, yi, i;
 
-  if (block_finder->use_highbd) {
-    const uint16_t *const data16 = (const uint16_t *const)data;
-    for (yi = 0; yi < block_size; ++yi) {
-      const int y = clamp(offsy + yi, 0, h - 1);
-      for (xi = 0; xi < block_size; ++xi) {
-        const int x = clamp(offsx + xi, 0, w - 1);
-        block[yi * block_size + xi] =
-            ((double)data16[y * stride + x]) / block_finder->normalization;
-      }
-    }
-  } else {
-    for (yi = 0; yi < block_size; ++yi) {
-      const int y = clamp(offsy + yi, 0, h - 1);
-      for (xi = 0; xi < block_size; ++xi) {
-        const int x = clamp(offsx + xi, 0, w - 1);
-        block[yi * block_size + xi] =
-            ((double)data[y * stride + x]) / block_finder->normalization;
-      }
+  const uint16_t *const data16 = (const uint16_t *const)data;
+  for (yi = 0; yi < block_size; ++yi) {
+    const int y = clamp(offsy + yi, 0, h - 1);
+    for (xi = 0; xi < block_size; ++xi) {
+      const int x = clamp(offsx + xi, 0, w - 1);
+      block[yi * block_size + xi] =
+          ((double)data16[y * stride + x]) / block_finder->normalization;
     }
   }
   multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
@@ -788,7 +768,6 @@
     return val;                                                            \
   }
 
-EXTRACT_AR_ROW(uint8_t, lowbd);
 EXTRACT_AR_ROW(uint16_t, highbd);
 
 static int add_block_observations(
@@ -829,19 +808,12 @@
               : ((block_size >> sub_log2[0]) - lag));
       for (int y = y_start; y < y_end; ++y) {
         for (int x = x_start; x < x_end; ++x) {
-          const double val =
-              noise_model->params.use_highbd
-                  ? extract_ar_row_highbd(noise_model->coords, num_coords,
-                                          (const uint16_t *const)data,
-                                          (const uint16_t *const)denoised,
-                                          stride, sub_log2,
-                                          (const uint16_t *const)alt_data,
-                                          (const uint16_t *const)alt_denoised,
-                                          alt_stride, x + x_o, y + y_o, buffer)
-                  : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
-                                         denoised, stride, sub_log2, alt_data,
-                                         alt_denoised, alt_stride, x + x_o,
-                                         y + y_o, buffer);
+          const double val = extract_ar_row_highbd(
+              noise_model->coords, num_coords, (const uint16_t *const)data,
+              (const uint16_t *const)denoised, stride, sub_log2,
+              (const uint16_t *const)alt_data,
+              (const uint16_t *const)alt_denoised, alt_stride, x + x_o, y + y_o,
+              buffer);
           for (int i = 0; i < n; ++i) {
             for (int j = 0; j < n; ++j) {
               A[i * n + j] +=
@@ -890,12 +862,10 @@
       if (num_samples_w * num_samples_h > block_size) {
         const double block_mean = get_block_mean(
             alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
-            x_o << sub_log2[0], y_o << sub_log2[1], block_size,
-            noise_model->params.use_highbd);
+            x_o << sub_log2[0], y_o << sub_log2[1], block_size);
         const double noise_var = get_noise_var(
             data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
-            y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
-            noise_model->params.use_highbd);
+            y_o, block_size >> sub_log2[0], block_size >> sub_log2[1]);
         // We want to remove the part of the noise that came from being
         // correlated with luma. Note that the noise solver for luma must
         // have already been run.
@@ -1330,13 +1300,11 @@
     }                                                                       \
   }
 
-DITHER_AND_QUANTIZE(uint8_t, lowbd);
 DITHER_AND_QUANTIZE(uint16_t, highbd);
 
 int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
                           int w, int h, int stride[3], int chroma_sub[2],
-                          float *noise_psd[3], int block_size, int bit_depth,
-                          int use_highbd) {
+                          float *noise_psd[3], int block_size, int bit_depth) {
   float *plane = NULL, *block = NULL, *window_full = NULL,
         *window_chroma = NULL;
   double *block_d = NULL, *plane_d = NULL;
@@ -1357,8 +1325,8 @@
             "subsampling");
     return 0;
   }
-  init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
-                                             bit_depth, use_highbd);
+  init_success &=
+      aom_flat_block_finder_init(&block_finder_full, block_size, bit_depth);
   result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
                                sizeof(*result));
   plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
@@ -1370,9 +1338,8 @@
   tx_full = aom_noise_tx_malloc(block_size);
 
   if (chroma_sub[0] != 0) {
-    init_success &= aom_flat_block_finder_init(&block_finder_chroma,
-                                               block_size >> chroma_sub[0],
-                                               bit_depth, use_highbd);
+    init_success &= aom_flat_block_finder_init(
+        &block_finder_chroma, block_size >> chroma_sub[0], bit_depth);
     window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
     tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
   } else {
@@ -1440,15 +1407,9 @@
         }
       }
     }
-    if (use_highbd) {
-      dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
-                                 w, h, stride[c], chroma_sub_w, chroma_sub_h,
-                                 block_size, kBlockNormalization);
-    } else {
-      dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
-                                stride[c], chroma_sub_w, chroma_sub_h,
-                                block_size, kBlockNormalization);
-    }
+    dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+                               w, h, stride[c], chroma_sub_w, chroma_sub_h,
+                               block_size, kBlockNormalization);
   }
   aom_free(result);
   aom_free(plane);
@@ -1536,7 +1497,6 @@
   if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
       ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
     return 1;
-  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
   const int block_size = ctx->block_size;
 
   ctx->width = sd->y_width;
@@ -1551,9 +1511,9 @@
   aom_free(ctx->flat_blocks);
   ctx->flat_blocks = NULL;
 
-  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
-  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
-  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << 1);
+  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << 1);
+  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << 1);
   if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
     fprintf(stderr, "Unable to allocate denoise buffers\n");
     return 0;
@@ -1564,13 +1524,13 @@
 
   aom_flat_block_finder_free(&ctx->flat_block_finder);
   if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
-                                  ctx->bit_depth, use_highbd)) {
+                                  ctx->bit_depth)) {
     fprintf(stderr, "Unable to init flat block finder\n");
     return 0;
   }
 
   const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
-                                            ctx->bit_depth, use_highbd };
+                                            ctx->bit_depth };
   aom_noise_model_free(&ctx->noise_model);
   if (!aom_noise_model_init(&ctx->noise_model, params)) {
     fprintf(stderr, "Unable to init noise model\n");
@@ -1594,11 +1554,10 @@
                               YV12_BUFFER_CONFIG *sd,
                               aom_film_grain_t *film_grain) {
   const int block_size = ctx->block_size;
-  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
   uint8_t *raw_data[3] = {
-    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
-    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
-    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+    (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer),
+    (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer),
+    (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer),
   };
   const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
   int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
@@ -1614,7 +1573,7 @@
 
   if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
                              strides, chroma_sub_log2, ctx->noise_psd,
-                             block_size, ctx->bit_depth, use_highbd)) {
+                             block_size, ctx->bit_depth)) {
     fprintf(stderr, "Unable to denoise image\n");
     return 0;
   }
@@ -1644,12 +1603,9 @@
     if (!film_grain->random_seed) {
       film_grain->random_seed = 7391;
     }
-    memcpy(raw_data[0], ctx->denoised[0],
-           (strides[0] * sd->y_height) << use_highbd);
-    memcpy(raw_data[1], ctx->denoised[1],
-           (strides[1] * sd->uv_height) << use_highbd);
-    memcpy(raw_data[2], ctx->denoised[2],
-           (strides[2] * sd->uv_height) << use_highbd);
+    memcpy(raw_data[0], ctx->denoised[0], (strides[0] * sd->y_height) << 1);
+    memcpy(raw_data[1], ctx->denoised[1], (strides[1] * sd->uv_height) << 1);
+    memcpy(raw_data[2], ctx->denoised[2], (strides[2] * sd->uv_height) << 1);
   }
   return 1;
 }
diff --git a/aom_dsp/noise_model.h b/aom_dsp/noise_model.h
index fcf9cb9..4a7a4f7 100644
--- a/aom_dsp/noise_model.h
+++ b/aom_dsp/noise_model.h
@@ -133,12 +133,11 @@
   int num_params;  // The number of parameters used for internal low-order model
   int block_size;  // The block size the finder was initialized with
   double normalization;  // Normalization factor (1 / (2^(bit_depth) - 1))
-  int use_highbd;        // Whether input data should be interpreted as uint16
 } aom_flat_block_finder_t;
 
 /*!\brief Init the block_finder with the given block size, bit_depth */
 int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
-                               int block_size, int bit_depth, int use_highbd);
+                               int block_size, int bit_depth);
 void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
 
 /*!\brief Helper to extract a block and low order "planar" model. */
@@ -171,7 +170,6 @@
   aom_noise_shape shape;
   int lag;
   int bit_depth;
-  int use_highbd;
 } aom_noise_model_params_t;
 
 /*!\brief State of a noise model estimate for a single channel.
@@ -274,14 +272,10 @@
  * \param[in]     noise_psd       The power spectral density of the noise
  * \param[in]     block_size      The size of blocks
  * \param[in]     bit_depth       Bit depth of the image
- * \param[in]     use_highbd      If true, uint8 pointers are interpreted as
- *                                uint16 and stride is measured in uint16.
- *                                This must be true when bit_depth >= 10.
  */
 int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
                           int w, int h, int stride[3], int chroma_sub_log2[2],
-                          float *noise_psd[3], int block_size, int bit_depth,
-                          int use_highbd);
+                          float *noise_psd[3], int block_size, int bit_depth);
 
 struct aom_denoise_and_model_t;
 
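
With the use_highbd parameter removed from the noise-model API above, callers are expected to always hand in 16-bit sample buffers (the denoiser now interprets the uint8_t pointers as uint16_t unconditionally). A sketch of the updated call pattern; the block_size/bit_depth values and the wrapper function are illustrative, not part of the tree:

    #include "aom_dsp/noise_model.h"

    /* Sketch only: post-change signatures of aom_flat_block_finder_init()
     * and aom_wiener_denoise_2d(); data[]/denoised[] are assumed to hold
     * uint16_t samples behind the uint8_t pointers. */
    static int denoise_frame_sketch(const uint8_t *const data[3],
                                    uint8_t *denoised[3], int w, int h,
                                    int stride[3], int chroma_sub_log2[2],
                                    float *noise_psd[3]) {
      aom_flat_block_finder_t bf;
      if (!aom_flat_block_finder_init(&bf, /*block_size=*/32, /*bit_depth=*/10))
        return 0;
      aom_flat_block_finder_free(&bf);
      return aom_wiener_denoise_2d(data, denoised, w, h, stride,
                                   chroma_sub_log2, noise_psd,
                                   /*block_size=*/32, /*bit_depth=*/10);
    }
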
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c
index 04d4bd4..eeb7e87 100644
--- a/aom_dsp/psnr.c
+++ b/aom_dsp/psnr.c
@@ -27,26 +27,6 @@
   }
 }
 
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, unsigned int *sse,
-                             int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
 static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
                                       const uint8_t *b8, int b_stride, int w,
                                       int h, uint64_t *sse, int64_t *sum) {
@@ -80,46 +60,6 @@
   *sum = (int)sum_long;
 }
 
-static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
-                       int b_stride, int width, int height) {
-  const int dw = width % 16;
-  const int dh = height % 16;
-  int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
-  int x, y;
-
-  if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
-                     height, &sse, &sum);
-    total_sse += sse;
-  }
-
-  if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride, width - dw, dh,
-                     &sse, &sum);
-    total_sse += sse;
-  }
-
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      aom_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-
-      pa += 16;
-      pb += 16;
-    }
-
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-
-  return total_sse;
-}
-
 static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
                                     const uint8_t *b8, int b_stride, int width,
                                     int height, unsigned int input_shift) {
@@ -173,78 +113,6 @@
   return total_sse;
 }
 
-uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
-                       int vstart, int height) {
-  return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
-                       width, height) /
-         (width * height);
-}
-
-uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
-                       int vstart, int height) {
-  return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart,
-                       a->uv_stride, width, height) /
-         (width * height);
-}
-
-uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
-                       int vstart, int height) {
-  return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart,
-                       a->uv_stride, width, height) /
-         (width * height);
-}
-
-int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height) {
-  return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
-                 b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
-                 width, height);
-}
-
-int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-
-  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                 a->y_crop_width, a->y_crop_height);
-}
-
-int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height) {
-  return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
-                 b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
-                 width, height);
-}
-
-int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-
-  return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
-                 a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height) {
-  return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
-                 b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
-                 width, height);
-}
-
-int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-
-  return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
-                 a->uv_crop_width, a->uv_crop_height);
-}
-
 uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
                               int width, int vstart, int height) {
   return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart,
@@ -278,8 +146,6 @@
                              const YV12_BUFFER_CONFIG *b) {
   assert(a->y_crop_width == b->y_crop_width);
   assert(a->y_crop_height == b->y_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
 
   return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
                         a->y_crop_width, a->y_crop_height);
@@ -298,8 +164,6 @@
                              const YV12_BUFFER_CONFIG *b) {
   assert(a->uv_crop_width == b->uv_crop_width);
   assert(a->uv_crop_height == b->uv_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
 
   return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
                         a->uv_crop_width, a->uv_crop_height);
@@ -318,29 +182,18 @@
                              const YV12_BUFFER_CONFIG *b) {
   assert(a->uv_crop_width == b->uv_crop_width);
   assert(a->uv_crop_height == b->uv_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
 
   return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
                         a->uv_crop_width, a->uv_crop_height);
 }
 
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
-                          const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
-  if (highbd) {
-    switch (plane) {
-      case 0: return aom_highbd_get_y_sse(a, b);
-      case 1: return aom_highbd_get_u_sse(a, b);
-      case 2: return aom_highbd_get_v_sse(a, b);
-      default: assert(plane >= 0 && plane <= 2); return 0;
-    }
-  } else {
-    switch (plane) {
-      case 0: return aom_get_y_sse(a, b);
-      case 1: return aom_get_u_sse(a, b);
-      case 2: return aom_get_v_sse(a, b);
-      default: assert(plane >= 0 && plane <= 2); return 0;
-    }
+                          const YV12_BUFFER_CONFIG *b, int plane) {
+  switch (plane) {
+    case 0: return aom_highbd_get_y_sse(a, b);
+    case 1: return aom_highbd_get_u_sse(a, b);
+    case 2: return aom_highbd_get_v_sse(a, b);
+    default: assert(plane >= 0 && plane <= 2); return 0;
   }
 }
 
@@ -371,54 +224,14 @@
     const int h = heights[i];
     const uint32_t samples = w * h;
     uint64_t sse;
-    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (input_shift) {
-        sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
-                                   b_strides[i], w, h, input_shift);
-      } else {
-        sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
-                             b_strides[i], w, h);
-      }
+    if (input_shift) {
+      sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
+                                 b_strides[i], w, h, input_shift);
     } else {
-      sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
-                    h);
+      sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+                           b_strides[i], w, h);
     }
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
 
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] =
-      aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
-}
-
-void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                   PSNR_STATS *psnr) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-  static const double peak = 255.0;
-  const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
-  const int heights[3] = { a->y_crop_height, a->uv_crop_height,
-                           a->uv_crop_height };
-  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
-  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse =
-        get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
     psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h
index b3e0ec3..67f89ba 100644
--- a/aom_dsp/psnr.h
+++ b/aom_dsp/psnr.h
@@ -36,26 +36,8 @@
  * \param[in]    sse           Sum of squared errors
  */
 double aom_sse_to_psnr(double samples, double peak, double sse);
-uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
-                       int vstart, int height);
-uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
-                       int vstart, int height);
-uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
-                       int vstart, int height);
-int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height);
-int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height);
-int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height);
-int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
-                          const YV12_BUFFER_CONFIG *b, int plane, int highbd);
+                          const YV12_BUFFER_CONFIG *b, int plane);
 uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
                               int width, int vstart, int height);
 uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
@@ -80,8 +62,6 @@
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           unsigned int bit_depth, unsigned int in_bit_depth);
-void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                   PSNR_STATS *psnr);
 
 double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
                    const YV12_BUFFER_CONFIG *dest, double *phvs_y,
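
Since aom_get_sse_plane() loses its highbd flag and the 8-bit aom_calc_psnr() entry point is removed, remaining callers go through aom_calc_highbd_psnr(). A sketch of the updated usage; the wrapper and the bit-depth values are illustrative, not taken from the tree:

    #include "aom_dsp/psnr.h"

    /* Sketch only: per-plane SSE and overall PSNR after the change. For
     * 8-bit content the assumption is bit_depth == in_bit_depth == 8. */
    static double overall_psnr_sketch(const YV12_BUFFER_CONFIG *src,
                                      const YV12_BUFFER_CONFIG *recon) {
      const int64_t y_sse = aom_get_sse_plane(src, recon, /*plane=*/0);
      (void)y_sse;

      PSNR_STATS psnr;
      aom_calc_highbd_psnr(src, recon, &psnr, /*bit_depth=*/8,
                           /*in_bit_depth=*/8);
      return psnr.psnr[0];
    }
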
diff --git a/aom_dsp/psnrhvs.c b/aom_dsp/psnrhvs.c
index 421a6dc..0fe7e91 100644
--- a/aom_dsp/psnrhvs.c
+++ b/aom_dsp/psnrhvs.c
@@ -25,16 +25,6 @@
 #include "aom_dsp/ssim.h"
 #include "aom_ports/system_state.h"
 
-static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
-                           int xstride) {
-  int i, j;
-  (void)xstride;
-  aom_fdct8x8(x, y, ystride);
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++)
-      *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
-}
-
 static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                                int xstride) {
   int i, j;
@@ -113,11 +103,8 @@
 static double calc_psnrhvs(const unsigned char *src, int _systride,
                            const unsigned char *dst, int _dystride, double _par,
                            int _w, int _h, int _step, const double _csf[8][8],
-                           uint32_t _shift, int buf_is_hbd, int16_t pix_max,
-                           int luma) {
+                           uint32_t _shift, int16_t pix_max, int luma) {
   double ret;
-  const uint8_t *_src8 = src;
-  const uint8_t *_dst8 = dst;
   const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
   const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
   DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
@@ -136,13 +123,8 @@
   sum1 = sum2 = delt = 0.0f;
   for (y = 0; y < _h; y++) {
     for (x = 0; x < _w; x++) {
-      if (!buf_is_hbd) {
-        sum1 += _src8[y * _systride + x];
-        sum2 += _dst8[y * _dystride + x];
-      } else {
-        sum1 += _src16[y * _systride + x] >> _shift;
-        sum2 += _dst16[y * _dystride + x] >> _shift;
-      }
+      sum1 += _src16[y * _systride + x] >> _shift;
+      sum2 += _dst16[y * _dystride + x] >> _shift;
     }
   }
   if (luma) delt = (sum1 - sum2) / (_w * _h);
@@ -183,13 +165,8 @@
       double s_mask = 0;
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
-          if (!buf_is_hbd) {
-            dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
-            dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
-          } else {
-            dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
-            dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
-          }
+          dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+          dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
           dct_d[i * 8 + j] += (int)(delt + 0.5f);
         }
       }
@@ -211,13 +188,8 @@
         }
       }
       s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
-      if (!buf_is_hbd) {
-        od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-        od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
-      } else {
-        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
-      }
+      hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+      hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
           s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
@@ -251,7 +223,6 @@
   assert(bd == 8 || bd == 10 || bd == 12);
   assert(bd >= in_bd);
   assert(src->flags == dst->flags);
-  const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
 
   int16_t pix_max = 255;
   if (in_bd == 10)
@@ -261,18 +232,17 @@
 
   bd_shift = bd - in_bd;
 
-  *y_psnrhvs =
-      calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride,
-                   par, src->y_crop_width, src->y_crop_height, step, csf_y,
-                   bd_shift, buf_is_hbd, pix_max, 1);
+  *y_psnrhvs = calc_psnrhvs(
+      src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
+      src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, pix_max, 1);
   *u_psnrhvs =
       calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
                    par, src->uv_crop_width, src->uv_crop_height, step,
-                   csf_cb420, bd_shift, buf_is_hbd, pix_max, 0);
+                   csf_cb420, bd_shift, pix_max, 0);
   *v_psnrhvs =
       calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
                    par, src->uv_crop_width, src->uv_crop_height, step,
-                   csf_cr420, bd_shift, buf_is_hbd, pix_max, 0);
+                   csf_cr420, bd_shift, pix_max, 0);
   psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
   return convert_score_db(psnrhvs, 1.0, pix_max);
 }
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 98e2c75..372ff0a 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -14,164 +14,6 @@
 #include "aom_mem/aom_mem.h"
 #include "av1/encoder/av1_quantize.h"
 
-void aom_quantize_b_adaptive_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr, const int log_scale) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  int prescan_add[2];
-  for (i = 0; i < 2; ++i)
-    prescan_add[i] =
-        ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7 + QUANT_TABLE_BITS);
-  // Pre-scan pass
-  for (i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int coeff = coeff_ptr[rc] * wt;
-    const int prescan_add_val = prescan_add[rc != 0];
-    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
-        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
-      non_zero_count--;
-    else
-      break;
-  }
-
-  // Quantization pass: All coefficients with index >= zero_flag are
-  // skippable. Note: zero_flag can be zero.
-#if SKIP_EOB_FACTOR_ADJUST
-  int first = -1;
-#endif  // SKIP_EOB_FACTOR_ADJUST
-  for (i = 0; i < non_zero_count; i++) {
-    const int rc = scan[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = AOMSIGN(coeff);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    int tmp32;
-
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
-      int64_t tmp =
-          clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
-                INT16_MIN, INT16_MAX);
-      tmp *= wt;
-      tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                     quant_shift_ptr[rc != 0]) >>
-                    (16 - log_scale + AOM_QM_BITS));  // quantization
-      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-      const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-      const int dequant =
-          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-      const tran_low_t abs_dqcoeff = (tran_low_t)ROUND_POWER_OF_TWO_64(
-                                         tmp32 * dequant, QUANT_TABLE_BITS) >>
-                                     log_scale;
-      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-
-      if (tmp32) {
-        eob = i;
-#if SKIP_EOB_FACTOR_ADJUST
-        if (first == -1) first = i;
-#endif  // SKIP_EOB_FACTOR_ADJUST
-      }
-    }
-  }
-#if SKIP_EOB_FACTOR_ADJUST
-  if (eob >= 0 && first == eob) {
-    const int rc = scan[eob];
-    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
-      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-      const int coeff = coeff_ptr[rc] * wt;
-      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
-      const int prescan_add_val = ROUND_POWER_OF_TWO(
-          dequant_ptr[rc != 0] * factor, 7 + QUANT_TABLE_BITS);
-      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
-          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
-        qcoeff_ptr[rc] = 0;
-        dqcoeff_ptr[rc] = 0;
-        eob = -1;
-      }
-    }
-  }
-#endif  // SKIP_EOB_FACTOR_ADJUST
-  *eob_ptr = eob + 1;
-}
-
-void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             const int32_t *zbin_ptr, const int32_t *round_ptr,
-                             const int32_t *quant_ptr,
-                             const int32_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan,
-                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
-                             const int log_scale) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int coeff = coeff_ptr[rc] * wt;
-
-    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
-        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
-      non_zero_count--;
-    else
-      break;
-  }
-
-  // Quantization pass: All coefficients with index >= zero_flag are
-  // skippable. Note: zero_flag can be zero.
-  for (i = 0; i < non_zero_count; i++) {
-    const int rc = scan[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = AOMSIGN(coeff);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    int tmp32;
-
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
-      int64_t tmp =
-          clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
-                INT16_MIN, INT16_MAX);
-      tmp *= wt;
-      tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                     quant_shift_ptr[rc != 0]) >>
-                    (16 - log_scale + AOM_QM_BITS));  // quantization
-      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-      const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-      const int dequant =
-          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-      const tran_low_t abs_dqcoeff = (tran_low_t)ROUND_POWER_OF_TWO_64(
-                                         tmp32 * dequant, QUANT_TABLE_BITS) >>
-                                     log_scale;
-      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-
-      if (tmp32) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
 void aom_highbd_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
     const int32_t *round_ptr, const int32_t *quant_ptr,
@@ -323,46 +165,6 @@
   *eob_ptr = eob + 1;
 }
 
-/* These functions should only be called when quantisation matrices
-   are not used. */
-void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const int32_t *zbin_ptr,
-                               const int32_t *round_ptr,
-                               const int32_t *quant_ptr,
-                               const int32_t *quant_shift_ptr,
-                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                               const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                               const int16_t *scan, const int16_t *iscan) {
-  aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                                   quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                                   dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
-                                   iscan, NULL, NULL, 0);
-}
-
-void aom_quantize_b_32x32_adaptive_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                                   quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                                   dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
-                                   iscan, NULL, NULL, 1);
-}
-
-void aom_quantize_b_64x64_adaptive_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                                   quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                                   dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
-                                   iscan, NULL, NULL, 2);
-}
-
 void aom_highbd_quantize_b_adaptive_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
     const int32_t *round_ptr, const int32_t *quant_ptr,
@@ -399,41 +201,6 @@
                                           eob_ptr, scan, iscan, NULL, NULL, 2);
 }
 
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      const int32_t *zbin_ptr, const int32_t *round_ptr,
-                      const int32_t *quant_ptr, const int32_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan) {
-  aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                          quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                          eob_ptr, scan, iscan, NULL, NULL, 0);
-}
-
-void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const int32_t *zbin_ptr, const int32_t *round_ptr,
-                            const int32_t *quant_ptr,
-                            const int32_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                          quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                          eob_ptr, scan, iscan, NULL, NULL, 1);
-}
-
-void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const int32_t *zbin_ptr, const int32_t *round_ptr,
-                            const int32_t *quant_ptr,
-                            const int32_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                          quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                          eob_ptr, scan, iscan, NULL, NULL, 2);
-}
-
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int32_t *zbin_ptr, const int32_t *round_ptr,
                              const int32_t *quant_ptr,
diff --git a/aom_dsp/quantize.h b/aom_dsp/quantize.h
index 74f0867..8f8f1a5 100644
--- a/aom_dsp/quantize.h
+++ b/aom_dsp/quantize.h
@@ -21,37 +21,6 @@
 extern "C" {
 #endif
 
-void aom_quantize_b_adaptive_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr, const int log_scale);
-
-void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               const int32_t *zbin_ptr,
-                               const int32_t *round_ptr,
-                               const int32_t *quant_ptr,
-                               const int32_t *quant_shift_ptr,
-                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                               const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                               const int16_t *scan, const int16_t *iscan);
-
-void aom_quantize_b_32x32_adaptive_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan);
-
-void aom_quantize_b_64x64_adaptive_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan);
-
 void aom_highbd_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
     const int32_t *round_ptr, const int32_t *quant_ptr,
@@ -81,23 +50,6 @@
     tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan);
 
-void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             const int32_t *zbin_ptr, const int32_t *round_ptr,
-                             const int32_t *quant_ptr,
-                             const int32_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan,
-                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
-                             const int log_scale);
-
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      const int32_t *zbin_ptr, const int32_t *round_ptr,
-                      const int32_t *quant_ptr, const int32_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan);
-
 void aom_highbd_quantize_b_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
     const int32_t *round_ptr, const int32_t *quant_ptr,
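
The removed aom_quantize_b*_c wrappers were thin shims over the quantize helper with a NULL quantization matrix; their callers are assumed to map 1:1 onto the remaining high-bitdepth helper, which keeps the same parameter list. A sketch of the equivalent call (illustrative only, not part of the change):

    #include "aom_dsp/quantize.h"

    /* Sketch only: what a former aom_quantize_b_c() call becomes, routed
     * through aom_highbd_quantize_b_helper_c() with no quant matrices and
     * log_scale 0 (32x32/64x64 would use 1 and 2 respectively). */
    static void quantize_b_sketch(const tran_low_t *coeff_ptr,
                                  intptr_t n_coeffs, const int32_t *zbin_ptr,
                                  const int32_t *round_ptr,
                                  const int32_t *quant_ptr,
                                  const int32_t *quant_shift_ptr,
                                  tran_low_t *qcoeff_ptr,
                                  tran_low_t *dqcoeff_ptr,
                                  const int32_t *dequant_ptr,
                                  uint16_t *eob_ptr, const int16_t *scan,
                                  const int16_t *iscan) {
      aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                     quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                     dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                     iscan, NULL, NULL, 0);
    }
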
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index 0fedcbf..0e8de74 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -19,170 +19,6 @@
 #include "aom_ports/mem.h"
 #include "aom_dsp/blend.h"
 
-/* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      sad += abs(a[x] - b[x]);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sad;
-}
-
-#define sadMxh(m)                                                          \
-  unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride,            \
-                                const uint8_t *b, int b_stride, int width, \
-                                int height) {                              \
-    return sad(a, a_stride, b, b_stride, width, height);                   \
-  }
-
-#define sadMxN(m, n)                                                          \
-  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,       \
-                                    const uint8_t *ref, int ref_stride) {     \
-    return sad(src, src_stride, ref, ref_stride, m, n);                       \
-  }                                                                           \
-  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride,   \
-                                        const uint8_t *ref, int ref_stride,   \
-                                        const uint8_t *second_pred) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);         \
-    return sad(src, src_stride, comp_pred, m, m, n);                          \
-  }                                                                           \
-  unsigned int aom_dist_wtd_sad##m##x##n##_avg_c(                             \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref,           \
-                                 ref_stride, jcp_param);                      \
-    return sad(src, src_stride, comp_pred, m, m, n);                          \
-  }                                                                           \
-  unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref,                 \
-                                          int ref_stride) {                   \
-    return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2));   \
-  }
-
-// Calculate sad against 4 reference locations and store each in sad_array
-#define sadMxNx4D(m, n)                                                       \
-  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,            \
-                               const uint8_t *const ref_array[4],             \
-                               int ref_stride, uint32_t sad_array[4]) {       \
-    int i;                                                                    \
-    for (i = 0; i < 4; ++i) {                                                 \
-      sad_array[i] =                                                          \
-          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride);    \
-    }                                                                         \
-  }                                                                           \
-  void aom_sad##m##x##n##x4d_avg_c(                                           \
-      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
-      int ref_stride, const uint8_t *second_pred, uint32_t sad_array[4]) {    \
-    int i;                                                                    \
-    for (i = 0; i < 4; ++i) {                                                 \
-      sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i],  \
-                                              ref_stride, second_pred);       \
-    }                                                                         \
-  }                                                                           \
-  void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride,      \
-                                     const uint8_t *const ref_array[4],       \
-                                     int ref_stride, uint32_t sad_array[4]) { \
-    int i;                                                                    \
-    for (i = 0; i < 4; ++i) {                                                 \
-      sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i],               \
-                             2 * ref_stride, (m), (n / 2));                   \
-    }                                                                         \
-  }
-
-// 128x128
-sadMxN(128, 128);
-sadMxNx4D(128, 128);
-
-// 128x64
-sadMxN(128, 64);
-sadMxNx4D(128, 64);
-
-// 64x128
-sadMxN(64, 128);
-sadMxNx4D(64, 128);
-
-// 64x64
-sadMxN(64, 64);
-sadMxNx4D(64, 64);
-
-// 64x32
-sadMxN(64, 32);
-sadMxNx4D(64, 32);
-
-// 32x64
-sadMxN(32, 64);
-sadMxNx4D(32, 64);
-
-// 32x32
-sadMxN(32, 32);
-sadMxNx4D(32, 32);
-
-// 32x16
-sadMxN(32, 16);
-sadMxNx4D(32, 16);
-
-// 16x32
-sadMxN(16, 32);
-sadMxNx4D(16, 32);
-
-// 16x16
-sadMxN(16, 16);
-sadMxNx4D(16, 16);
-
-// 16x8
-sadMxN(16, 8);
-sadMxNx4D(16, 8);
-
-// 8x16
-sadMxN(8, 16);
-sadMxNx4D(8, 16);
-
-// 8x8
-sadMxN(8, 8);
-sadMxNx4D(8, 8);
-
-// 8x4
-sadMxN(8, 4);
-sadMxNx4D(8, 4);
-
-// 4x8
-sadMxN(4, 8);
-sadMxNx4D(4, 8);
-
-// 4x4
-sadMxN(4, 4);
-sadMxNx4D(4, 4);
-
-sadMxh(128);
-sadMxh(64);
-sadMxh(32);
-sadMxh(16);
-sadMxh(8);
-sadMxh(4);
-
-sadMxN(4, 16);
-sadMxNx4D(4, 16);
-sadMxN(16, 4);
-sadMxNx4D(16, 4);
-sadMxN(8, 32);
-sadMxNx4D(8, 32);
-sadMxN(32, 8);
-sadMxNx4D(32, 8);
-sadMxN(16, 64);
-sadMxNx4D(16, 64);
-sadMxN(64, 16);
-sadMxNx4D(64, 16);
-
 static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
                                       const uint8_t *b8, int b_stride,
                                       int width, int height) {
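
For reference, the high-bitdepth SAD kept above performs exactly the same per-pixel |a - b| accumulation as the removed 8-bit sad() helper, only over 16-bit sample buffers. A minimal standalone sketch of that computation (the name and signature below are illustrative, not the library routine):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: sum of absolute differences over 16-bit samples,
 * mirroring what the surviving highbd_sad() computes. */
static unsigned int sad16(const uint16_t *a, int a_stride, const uint16_t *b,
                          int b_stride, int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) sad += abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}
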
diff --git a/aom_dsp/sad_av1.c b/aom_dsp/sad_av1.c
index 808d9b3..4504ab6 100644
--- a/aom_dsp/sad_av1.c
+++ b/aom_dsp/sad_av1.c
@@ -19,85 +19,13 @@
 #include "aom_ports/mem.h"
 #include "aom_dsp/blend.h"
 
-static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
-                                      const uint8_t *a, int a_stride,
-                                      const uint8_t *b, int b_stride,
-                                      const uint8_t *m, int m_stride, int width,
-                                      int height) {
-  int y, x;
-  unsigned int sad = 0;
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  return sad;
-}
-
-#define MASKSADMxN(m, n)                                                       \
-  unsigned int aom_masked_sad##m##x##n##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
-      int invert_mask) {                                                       \
-    if (!invert_mask)                                                          \
-      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
-                        msk_stride, m, n);                                     \
-    else                                                                       \
-      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
-                        msk_stride, m, n);                                     \
-  }                                                                            \
-  void aom_masked_sad##m##x##n##x4d_c(                                         \
-      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
-      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
-      int msk_stride, int invert_mask, unsigned sads[]) {                      \
-    if (!invert_mask)                                                          \
-      for (int i = 0; i < 4; i++) {                                            \
-        sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
-                             m, msk, msk_stride, m, n);                        \
-      }                                                                        \
-    else                                                                       \
-      for (int i = 0; i < 4; i++) {                                            \
-        sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i],          \
-                             ref_stride, msk, msk_stride, m, n);               \
-      }                                                                        \
-  }
-
-/* clang-format off */
-MASKSADMxN(128, 128)
-MASKSADMxN(128, 64)
-MASKSADMxN(64, 128)
-MASKSADMxN(64, 64)
-MASKSADMxN(64, 32)
-MASKSADMxN(32, 64)
-MASKSADMxN(32, 32)
-MASKSADMxN(32, 16)
-MASKSADMxN(16, 32)
-MASKSADMxN(16, 16)
-MASKSADMxN(16, 8)
-MASKSADMxN(8, 16)
-MASKSADMxN(8, 8)
-MASKSADMxN(8, 4)
-MASKSADMxN(4, 8)
-MASKSADMxN(4, 4)
-MASKSADMxN(4, 16)
-MASKSADMxN(16, 4)
-MASKSADMxN(8, 32)
-MASKSADMxN(32, 8)
-MASKSADMxN(16, 64)
-MASKSADMxN(64, 16)
-
-    /* clang-format on */
-    static INLINE
-    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
-                                   const uint8_t *a8, int a_stride,
-                                   const uint8_t *b8, int b_stride,
-                                   const uint8_t *m, int m_stride, int width,
-                                   int height) {
+static INLINE unsigned int highbd_masked_sad(const uint8_t *src8,
+                                             int src_stride, const uint8_t *a8,
+                                             int a_stride, const uint8_t *b8,
+                                             int b_stride, const uint8_t *m,
+                                             int m_stride, int width,
+                                             int height) {
   int y, x;
   unsigned int sad = 0;
   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -158,61 +86,13 @@
 // pre: predictor being evaluated
 // wsrc: target weighted prediction (has been *4096 to keep precision)
 // mask: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
 
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
 
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define OBMCSADMxN(m, n)                                                     \
-  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
-                                         const int32_t *wsrc,                \
-                                         const int32_t *mask) {              \
-    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
-  }
-
-/* clang-format off */
-OBMCSADMxN(128, 128)
-OBMCSADMxN(128, 64)
-OBMCSADMxN(64, 128)
-OBMCSADMxN(64, 64)
-OBMCSADMxN(64, 32)
-OBMCSADMxN(32, 64)
-OBMCSADMxN(32, 32)
-OBMCSADMxN(32, 16)
-OBMCSADMxN(16, 32)
-OBMCSADMxN(16, 16)
-OBMCSADMxN(16, 8)
-OBMCSADMxN(8, 16)
-OBMCSADMxN(8, 8)
-OBMCSADMxN(8, 4)
-OBMCSADMxN(4, 8)
-OBMCSADMxN(4, 4)
-OBMCSADMxN(4, 16)
-OBMCSADMxN(16, 4)
-OBMCSADMxN(8, 32)
-OBMCSADMxN(32, 8)
-OBMCSADMxN(16, 64)
-OBMCSADMxN(64, 16)
-
-    /* clang-format on */
-
-    static INLINE
-    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
-                                 const int32_t *wsrc, const int32_t *mask,
-                                 int width, int height) {
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int width,
+                                           int height) {
   int y, x;
   unsigned int sad = 0;
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
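
The OBMC comment kept above still describes the fixed-point layout: wsrc holds the weighted source scaled by 4096 and mask holds 2-D weights scaled by 4096. A hedged sketch of how one SAD term is formed from those operands, using a plain rounding shift instead of the library's ROUND_POWER_OF_TWO macro (names are illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: one term of the OBMC SAD. pre * mask lands in the same
 * *4096 fixed-point domain as wsrc, and the absolute difference is rounded
 * back down by 12 bits. */
static unsigned int obmc_sad_term(int32_t wsrc, int32_t mask, uint16_t pre) {
  const int32_t diff = abs(wsrc - (int32_t)pre * mask);
  return (unsigned int)((diff + (1 << 11)) >> 12); /* round to nearest */
}
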
diff --git a/aom_dsp/sse.c b/aom_dsp/sse.c
index c71ebe6..4c18301 100644
--- a/aom_dsp/sse.c
+++ b/aom_dsp/sse.c
@@ -17,23 +17,6 @@
 
 #include "aom/aom_integer.h"
 
-int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
-                  int b_stride, int width, int height) {
-  int y, x;
-  int64_t sse = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const int32_t diff = abs(a[x] - b[x]);
-      sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sse;
-}
-
 int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
                          int b_stride, int width, int height) {
   int y, x;
diff --git a/aom_dsp/ssim.c b/aom_dsp/ssim.c
index 5510ea3..7c8613b 100644
--- a/aom_dsp/ssim.c
+++ b/aom_dsp/ssim.c
@@ -19,37 +19,6 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 
-void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
-                            uint32_t *sum_s, uint32_t *sum_r,
-                            uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                            uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 16; i++, s += sp, r += rp) {
-    for (j = 0; j < 16; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
-                          uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
-                          uint32_t *sum_sq_r, uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
 void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
                                  int rp, uint32_t *sum_s, uint32_t *sum_r,
                                  uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -103,13 +72,6 @@
   return ssim_n / ssim_d;
 }
 
-static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
-  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                     &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
-}
-
 static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
                               int rp, uint32_t bd, uint32_t shift) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
@@ -122,26 +84,6 @@
 // We are using an 8x8 moving window with the starting location of each 8x8
 // window on the 4x4 pixel grid. Such an arrangement allows the windows to
 // overlap block boundaries to penalize blocking artifacts.
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
-                        int stride_img1, int stride_img2, int width,
-                        int height) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-
 static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
                                int stride_img1, int stride_img2, int width,
                                int height, uint32_t bd, uint32_t shift) {
@@ -164,262 +106,6 @@
   return ssim_total;
 }
 
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight) {
-  double abc[3];
-  for (int i = 0; i < 3; ++i) {
-    const int is_uv = i > 0;
-    abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i],
-                       source->strides[is_uv], dest->strides[is_uv],
-                       source->crop_widths[is_uv], source->crop_heights[is_uv]);
-  }
-
-  *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
-}
-
-// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
-//
-// Re working out the math ->
-//
-// ssim(x,y) =  (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
-//   ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
-//
-// mean(x) = sum(x) / n
-//
-// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
-//
-// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
-//
-// ssim(x,y) =
-//   (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
-//   (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
-//    ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
-//     (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
-//
-// factoring out n*n
-//
-// ssim(x,y) =
-//   (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
-//   (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
-//    (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
-//
-// Replace c1 with n*n * c1 for the final step that leads to this code:
-// The final step scales by 12 bits so we don't lose precision in the constants.
-
-static double ssimv_similarity(const Ssimv *sv, int64_t n) {
-  // Scale the constants by number of pixels.
-  const int64_t c1 = (cc1 * n * n) >> 12;
-  const int64_t c2 = (cc2 * n * n) >> 12;
-
-  const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
-                   (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
-
-  // Since these variables are unsigned sums, convert to double so
-  // math is done in double arithmetic.
-  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
-                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
-                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
-
-  return l * v;
-}
-
-// The first term of the ssim metric is a luminance factor.
-//
-// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
-//
-// This luminance factor is super sensitive to the dark side of luminance
-// values and completely insensitive on the white side.  check out 2 sets
-// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60
-// 2*250*252/ (250^2+252^2) => .99999997
-//
-// As a result in this tweaked version of the calculation in which the
-// luminance is taken as percentage off from peak possible.
-//
-// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
-//
-static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
-  // Scale the constants by number of pixels.
-  const int64_t c1 = (cc1 * n * n) >> 12;
-  const int64_t c2 = (cc2 * n * n) >> 12;
-
-  const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
-  const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
-
-  // Since these variables are unsigned, sums convert to double so
-  // math is done in double arithmetic.
-  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
-                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
-                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
-
-  return l * v;
-}
-static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
-                        int img2_pitch, Ssimv *sv) {
-  aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
-                     &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
-}
-
-double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
-                            int img2_pitch, int width, int height, Ssimv *sv2,
-                            Metrics *m, int do_inconsistency) {
-  double dssim_total = 0;
-  double ssim_total = 0;
-  double ssim2_total = 0;
-  double inconsistency_total = 0;
-  int i, j;
-  int c = 0;
-  double norm;
-  double old_ssim_total = 0;
-  aom_clear_system_state();
-  // We can sample points as frequently as we like start with 1 per 4x4.
-  for (i = 0; i < height;
-       i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
-    for (j = 0; j < width; j += 4, ++c) {
-      Ssimv sv = { 0, 0, 0, 0, 0, 0 };
-      double ssim;
-      double ssim2;
-      double dssim;
-      uint32_t var_new;
-      uint32_t var_old;
-      uint32_t mean_new;
-      uint32_t mean_old;
-      double ssim_new;
-      double ssim_old;
-
-      // Not sure there's a great way to handle the edge pixels
-      // in ssim when using a window. Seems biased against edge pixels
-      // however you handle this. This uses only samples that are
-      // fully in the frame.
-      if (j + 8 <= width && i + 8 <= height) {
-        ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
-      }
-
-      ssim = ssimv_similarity(&sv, 64);
-      ssim2 = ssimv_similarity2(&sv, 64);
-
-      sv.ssim = ssim2;
-
-      // dssim is calculated to use as an actual error metric and
-      // is scaled up to the same range as sum square error.
-      // Since we are subsampling every 16th point maybe this should be
-      // *16 ?
-      dssim = 255 * 255 * (1 - ssim2) / 2;
-
-      // Here I introduce a new error metric: consistency-weighted
-      // SSIM-inconsistency.  This metric isolates frames where the
-      // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
-      // sharper or blurrier than the others. Higher values indicate a
-      // temporally inconsistent SSIM. There are two ideas at work:
-      //
-      // 1) 'SSIM-inconsistency': the total inconsistency value
-      // reflects how much SSIM values are changing between this
-      // source / reference frame pair and the previous pair.
-      //
-      // 2) 'consistency-weighted': weights de-emphasize areas in the
-      // frame where the scene content has changed. Changes in scene
-      // content are detected via changes in local variance and local
-      // mean.
-      //
-      // Thus the overall measure reflects how inconsistent the SSIM
-      // values are, over consistent regions of the frame.
-      //
-      // The metric has three terms:
-      //
-      // term 1 -> uses change in scene Variance to weight error score
-      //  2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
-      //  larger changes from one frame to the next mean we care
-      //  less about consistency.
-      //
-      // term 2 -> uses change in local scene luminance to weight error
-      //  2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
-      //  larger changes from one frame to the next mean we care
-      //  less about consistency.
-      //
-      // term3 -> measures inconsistency in ssim scores between frames
-      //   1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2).
-      //
-      // This term compares the ssim score for the same location in 2
-      // subsequent frames.
-      var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
-      var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
-      mean_new = sv.sum_s;
-      mean_old = sv2[c].sum_s;
-      ssim_new = sv.ssim;
-      ssim_old = sv2[c].ssim;
-
-      if (do_inconsistency) {
-        // We do the metric once for every 4x4 block in the image. Since
-        // we are scaling the error to SSE for use in a psnr calculation
-        // 1.0 = 4x4x255x255 the worst error we can possibly have.
-        static const double kScaling = 4. * 4 * 255 * 255;
-
-        // The constants have to be non 0 to avoid potential divide by 0
-        // issues other than that they affect kind of a weighting between
-        // the terms.  No testing of what the right terms should be has been
-        // done.
-        static const double c1 = 1, c2 = 1, c3 = 1;
-
-        // This measures how much consistent variance is in two consecutive
-        // source frames. 1.0 means they have exactly the same variance.
-        const double variance_term =
-            (2.0 * var_old * var_new + c1) /
-            (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
-
-        // This measures how consistent the local mean are between two
-        // consecutive frames. 1.0 means they have exactly the same mean.
-        const double mean_term =
-            (2.0 * mean_old * mean_new + c2) /
-            (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
-
-        // This measures how consistent the ssims of two
-        // consecutive frames is. 1.0 means they are exactly the same.
-        double ssim_term =
-            pow((2.0 * ssim_old * ssim_new + c3) /
-                    (ssim_old * ssim_old + ssim_new * ssim_new + c3),
-                5);
-
-        double this_inconsistency;
-
-        // Floating point math sometimes makes this > 1 by a tiny bit.
-        // We want the metric to scale between 0 and 1.0 so we can convert
-        // it to an snr scaled value.
-        if (ssim_term > 1) ssim_term = 1;
-
-        // This converts the consistency metric to an inconsistency metric
-        // ( so we can scale it like psnr to something like sum square error.
-        // The reason for the variance and mean terms is the assumption that
-        // if there are big changes in the source we shouldn't penalize
-        // inconsistency in ssim scores a bit less as it will be less visible
-        // to the user.
-        this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
-
-        this_inconsistency *= kScaling;
-        inconsistency_total += this_inconsistency;
-      }
-      sv2[c] = sv;
-      ssim_total += ssim;
-      ssim2_total += ssim2;
-      dssim_total += dssim;
-
-      old_ssim_total += ssim_old;
-    }
-    old_ssim_total += 0;
-  }
-
-  norm = 1. / (width / 4) / (height / 4);
-  ssim_total *= norm;
-  ssim2_total *= norm;
-  m->ssim2 = ssim2_total;
-  m->ssim = ssim_total;
-  if (old_ssim_total == 0) inconsistency_total = 0;
-
-  m->ssimc = inconsistency_total;
-
-  m->dssim = dssim_total;
-  return inconsistency_total;
-}
-
 double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest, double *weight,
                             uint32_t bd, uint32_t in_bd) {
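
The long derivation removed above factors the textbook SSIM expression so that only the five per-window sums are needed; the surviving high-bitdepth path computes the same quantity. A compact sketch of that factored form, assuming c1 and c2 are the usual stabilizing constants already scaled by n*n (names are illustrative, not the library's helpers):

/* Illustrative only: per-window SSIM from accumulated sums, in the factored
 * form derived in the removed comment block. c1/c2 must already include the
 * n*n scaling. */
static double ssim_from_sums(double sum_s, double sum_r, double sum_sq_s,
                             double sum_sq_r, double sum_sxr, double n,
                             double c1, double c2) {
  const double num = (2.0 * sum_s * sum_r + c1) *
                     (2.0 * (n * sum_sxr - sum_s * sum_r) + c2);
  const double den = (sum_s * sum_s + sum_r * sum_r + c1) *
                     (n * sum_sq_s - sum_s * sum_s + n * sum_sq_r -
                      sum_r * sum_r + c2);
  return num / den;
}
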
diff --git a/aom_dsp/ssim.h b/aom_dsp/ssim.h
index 627ef6a..182ac18 100644
--- a/aom_dsp/ssim.h
+++ b/aom_dsp/ssim.h
@@ -65,13 +65,6 @@
   double ssimcd;
 } Metrics;
 
-double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
-                            int img2_pitch, int width, int height, Ssimv *sv2,
-                            Metrics *m, int do_inconsistency);
-
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight);
-
 double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest, double *ssim_y,
                          double *ssim_u, double *ssim_v, uint32_t bd,
diff --git a/aom_dsp/subtract.c b/aom_dsp/subtract.c
index 1f66d5a..714147e 100644
--- a/aom_dsp/subtract.c
+++ b/aom_dsp/subtract.c
@@ -18,21 +18,6 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-void aom_subtract_block_c(int rows, int cols, int16_t *diff,
-                          ptrdiff_t diff_stride, const uint8_t *src,
-                          ptrdiff_t src_stride, const uint8_t *pred,
-                          ptrdiff_t pred_stride) {
-  int r, c;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src += src_stride;
-  }
-}
-
 void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src8,
                                  ptrdiff_t src_stride, const uint8_t *pred8,
diff --git a/aom_dsp/txfm_common.h b/aom_dsp/txfm_common.h
index 42412e4..47a276e 100644
--- a/aom_dsp/txfm_common.h
+++ b/aom_dsp/txfm_common.h
@@ -36,9 +36,6 @@
   TX_SIZE tx_size;
   int lossless;
   int bd;
-  // are the pixel buffers octets or shorts?  This should collapse to
-  // bd==8 implies !is_hbd, but that's not certain right now.
-  int is_hbd;
   TxSetType tx_set_type;
   // for inverse transforms only
   int eob;
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index f031904..c60a86a 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -29,24 +29,6 @@
 #include "av1/common/reconinter.h"
 #include "av1/encoder/reconinter_enc.h"
 
-uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride) {
-  int distortion = 0;
-  int r, c;
-
-  for (r = 0; r < 4; ++r) {
-    for (c = 0; c < 4; ++c) {
-      int diff = a[c] - b[c];
-      distortion += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  return distortion;
-}
-
 uint32_t aom_get_mb_ss_c(const int16_t *a) {
   unsigned int i, sum = 0;
 
@@ -57,33 +39,6 @@
   return sum;
 }
 
-static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, int w, int h) {
-  uint32_t sse;
-  int sum;
-  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
-  return sse;
-}
-
 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
 // or vertical direction to produce the filtered output block. Used to implement
 // the first-pass of 2-D separable filter.
@@ -142,283 +97,6 @@
   }
 }
 
-#define VAR(W, H)                                                    \
-  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                     const uint8_t *b, int b_stride, \
-                                     uint32_t *sse) {                \
-    int sum;                                                         \
-    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
-  }
-
-#define SUBPIX_VAR(W, H)                                                      \
-  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
-      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-                                                                              \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
-                                            bilinear_filters_2t[xoffset]);    \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
-                                             bilinear_filters_2t[yoffset]);   \
-                                                                              \
-    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
-  }
-
-#define SUBPIX_AVG_VAR(W, H)                                                   \
-  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                            \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
-      const uint8_t *b, int b_stride, uint32_t *sse,                           \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint8_t temp2[H * W];                                                      \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
-                                                                               \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
-                                            bilinear_filters_2t[xoffset]);     \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
-                                             bilinear_filters_2t[yoffset]);    \
-                                                                               \
-    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                     \
-                                                                               \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
-  }                                                                            \
-  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                   \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
-      const uint8_t *b, int b_stride, uint32_t *sse,                           \
-      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint8_t temp2[H * W];                                                      \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
-                                                                               \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
-                                            bilinear_filters_2t[xoffset]);     \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
-                                             bilinear_filters_2t[yoffset]);    \
-                                                                               \
-    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
-                                                                               \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                  \
-  }
-
-/* Identical to the variance call except it takes an additional parameter, sum,
- * and returns that value using pass-by-reference instead of returning
- * sse - sum^2 / w*h
- */
-#define GET_VAR(W, H)                                                         \
-  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
-                               const uint8_t *b, int b_stride, uint32_t *sse, \
-                               int *sum) {                                    \
-    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
-  }
-
-/* Identical to the variance call except it does not calculate the
- * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
- * variable.
- */
-#define MSE(W, H)                                               \
-  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                const uint8_t *b, int b_stride, \
-                                uint32_t *sse) {                \
-    int sum;                                                    \
-    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
-    return *sse;                                                \
-  }
-
-/* All three forms of the variance are available in the same sizes. */
-#define VARIANCES(W, H) \
-  VAR(W, H)             \
-  SUBPIX_VAR(W, H)      \
-  SUBPIX_AVG_VAR(W, H)
-
-VARIANCES(128, 128)
-VARIANCES(128, 64)
-VARIANCES(64, 128)
-VARIANCES(64, 64)
-VARIANCES(64, 32)
-VARIANCES(32, 64)
-VARIANCES(32, 32)
-VARIANCES(32, 16)
-VARIANCES(16, 32)
-VARIANCES(16, 16)
-VARIANCES(16, 8)
-VARIANCES(8, 16)
-VARIANCES(8, 8)
-VARIANCES(8, 4)
-VARIANCES(4, 8)
-VARIANCES(4, 4)
-VARIANCES(4, 2)
-VARIANCES(2, 4)
-VARIANCES(2, 2)
-VARIANCES(4, 16)
-VARIANCES(16, 4)
-VARIANCES(8, 32)
-VARIANCES(32, 8)
-VARIANCES(16, 64)
-VARIANCES(64, 16)
-
-GET_VAR(16, 16)
-GET_VAR(8, 8)
-
-MSE(16, 16)
-MSE(16, 8)
-MSE(8, 16)
-MSE(8, 8)
-
-void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                         int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-// Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                          int mi_row, int mi_col, const MV *const mv,
-                          uint8_t *comp_pred, int width, int height,
-                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                          int ref_stride, int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi, xd->tree_type);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilter filters = EIGHTTAP_REGULAR;
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                   int mi_row, int mi_col, const MV *const mv,
-                                   uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, int subpel_x_q3,
-                                   int subpel_y_q3, const uint8_t *ref,
-                                   int ref_stride, int subpel_search) {
-  int i, j;
-
-  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
-void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                  int width, int height, const uint8_t *ref,
-                                  int ref_stride,
-                                  const DIST_WTD_COMP_PARAMS *jcp_param) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint8_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-void aom_dist_wtd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-
-  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint8_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 static void highbd_variance64(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint64_t *sse, int64_t *sum) {
@@ -853,7 +531,7 @@
       av1_init_inter_params(
           &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
           mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+          xd->bd, is_intrabc, sf, pre_buf, filters);
       av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                         &inter_pred_params);
       return;
@@ -971,88 +649,6 @@
   }
 }
 
-void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride,
-                          const uint8_t *mask, int mask_stride,
-                          int invert_mask) {
-  int i, j;
-  const uint8_t *src0 = invert_mask ? pred : ref;
-  const uint8_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
-    }
-    comp_pred += width;
-    src0 += stride0;
-    src1 += stride1;
-    mask += mask_stride;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, int subpel_x_q3,
-                                    int subpel_y_q3, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask,
-                                    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                         subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
-                       mask_stride, invert_mask);
-}
-
-#define MASK_SUBPIX_VAR(W, H)                                                  \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                     \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
-      const uint8_t *msk, int msk_stride, int invert_mask,                     \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint8_t temp2[H * W];                                                      \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
-                                                                               \
-    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
-                                            W, bilinear_filters_2t[xoffset]);  \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
-                                             bilinear_filters_2t[yoffset]);    \
-                                                                               \
-    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride,  \
-                         invert_mask);                                         \
-    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);          \
-  }
-
-MASK_SUBPIX_VAR(4, 4)
-MASK_SUBPIX_VAR(4, 8)
-MASK_SUBPIX_VAR(8, 4)
-MASK_SUBPIX_VAR(8, 8)
-MASK_SUBPIX_VAR(8, 16)
-MASK_SUBPIX_VAR(16, 8)
-MASK_SUBPIX_VAR(16, 16)
-MASK_SUBPIX_VAR(16, 32)
-MASK_SUBPIX_VAR(32, 16)
-MASK_SUBPIX_VAR(32, 32)
-MASK_SUBPIX_VAR(32, 64)
-MASK_SUBPIX_VAR(64, 32)
-MASK_SUBPIX_VAR(64, 64)
-MASK_SUBPIX_VAR(64, 128)
-MASK_SUBPIX_VAR(128, 64)
-MASK_SUBPIX_VAR(128, 128)
-MASK_SUBPIX_VAR(4, 16)
-MASK_SUBPIX_VAR(16, 4)
-MASK_SUBPIX_VAR(8, 32)
-MASK_SUBPIX_VAR(32, 8)
-MASK_SUBPIX_VAR(16, 64)
-MASK_SUBPIX_VAR(64, 16)
-
 void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                  int width, int height, const uint8_t *ref8,
                                  int ref_stride, const uint8_t *mask,
@@ -1178,112 +774,6 @@
 HIGHBD_MASK_SUBPIX_VAR(16, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 16)
 
-static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
-                                 const int32_t *wsrc, const int32_t *mask,
-                                 int w, int h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    pre += pre_stride;
-    wsrc += w;
-    mask += w;
-  }
-}
-
-#define OBMC_VAR(W, H)                                            \
-  unsigned int aom_obmc_variance##W##x##H##_c(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *mask, unsigned int *sse) {                   \
-    int sum;                                                      \
-    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
-  }
-
-#define OBMC_SUBPIX_VAR(W, H)                                                  \
-  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                       \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint8_t temp2[H * W];                                                      \
-                                                                               \
-    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
-                                            W, bilinear_filters_2t[xoffset]);  \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
-                                             bilinear_filters_2t[yoffset]);    \
-                                                                               \
-    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);          \
-  }
-
-OBMC_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 4)
-
-OBMC_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 8)
-
-OBMC_VAR(8, 4)
-OBMC_SUBPIX_VAR(8, 4)
-
-OBMC_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 8)
-
-OBMC_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 16)
-
-OBMC_VAR(16, 8)
-OBMC_SUBPIX_VAR(16, 8)
-
-OBMC_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 16)
-
-OBMC_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 32)
-
-OBMC_VAR(32, 16)
-OBMC_SUBPIX_VAR(32, 16)
-
-OBMC_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 32)
-
-OBMC_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 64)
-
-OBMC_VAR(64, 32)
-OBMC_SUBPIX_VAR(64, 32)
-
-OBMC_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 64)
-
-OBMC_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 128)
-
-OBMC_VAR(128, 64)
-OBMC_SUBPIX_VAR(128, 64)
-
-OBMC_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 128)
-
-OBMC_VAR(4, 16)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_VAR(16, 4)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_VAR(8, 32)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_VAR(32, 8)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_VAR(16, 64)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_VAR(64, 16)
-OBMC_SUBPIX_VAR(64, 16)
-
 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
@@ -1476,18 +966,6 @@
 HIGHBD_OBMC_VAR(64, 16)
 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
 
-uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
-                             int sstride, int w, int h) {
-  uint64_t sum = 0;
-  for (int i = 0; i < h; i++) {
-    for (int j = 0; j < w; j++) {
-      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
-      sum += e * e;
-    }
-  }
-  return sum;
-}
-
 uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                     int sstride, int w, int h) {
   uint64_t sum = 0;
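
The removed VAR()/MSE() macros rely on the identity variance = sse - sum*sum/(w*h), which the kept high-bitdepth variants apply unchanged to 16-bit buffers. A minimal standalone sketch of that computation (illustrative names, not the library functions):

#include <stdint.h>

/* Illustrative only: accumulate sum and sum of squared differences over a
 * w*h block of 16-bit samples, then apply var = sse - sum^2 / (w*h). */
static uint32_t block_variance16(const uint16_t *a, int a_stride,
                                 const uint16_t *b, int b_stride, int w, int h,
                                 uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse64 += (uint64_t)((int64_t)diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;
  return (uint32_t)(sse64 - (uint64_t)(sum * sum / (w * h)));
}
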
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index 3f577fa..32271f4 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -70,13 +70,6 @@
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search);
-
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
                                           const int32_t *msk);
@@ -121,9 +114,6 @@
     unsigned int output_height, unsigned int output_width,
     const uint8_t *filter);
 
-uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, int w, int h);
-
 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, int w, int h);
 
diff --git a/aom_dsp/vmaf.c b/aom_dsp/vmaf.c
index 83acc60..f125544 100644
--- a/aom_dsp/vmaf.c
+++ b/aom_dsp/vmaf.c
@@ -53,45 +53,24 @@
     assert(width == frames->distorted->y_width);
     assert(height == frames->distorted->y_height);
 
-    if (frames->source->flags & YV12_FLAG_HIGHBITDEPTH) {
-      const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
-      uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
-      uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
+    const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
+    uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
+    uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
 
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          ref_data[col] = scale_factor * (float)ref_ptr[col];
-        }
-        ref_ptr += frames->source->y_stride;
-        ref_data += stride / sizeof(*ref_data);
+    for (int row = 0; row < height; ++row) {
+      for (int col = 0; col < width; ++col) {
+        ref_data[col] = scale_factor * (float)ref_ptr[col];
       }
+      ref_ptr += frames->source->y_stride;
+      ref_data += stride / sizeof(*ref_data);
+    }
 
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          main_data[col] = scale_factor * (float)main_ptr[col];
-        }
-        main_ptr += frames->distorted->y_stride;
-        main_data += stride / sizeof(*main_data);
+    for (int row = 0; row < height; ++row) {
+      for (int col = 0; col < width; ++col) {
+        main_data[col] = scale_factor * (float)main_ptr[col];
       }
-    } else {
-      uint8_t *ref_ptr = frames->source->y_buffer;
-      uint8_t *main_ptr = frames->distorted->y_buffer;
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          ref_data[col] = (float)ref_ptr[col];
-        }
-        ref_ptr += frames->source->y_stride;
-        ref_data += stride / sizeof(*ref_data);
-      }
-
-      for (int row = 0; row < height; ++row) {
-        for (int col = 0; col < width; ++col) {
-          main_data[col] = (float)main_ptr[col];
-        }
-        main_ptr += frames->distorted->y_stride;
-        main_data += stride / sizeof(*main_data);
-      }
+      main_ptr += frames->distorted->y_stride;
+      main_data += stride / sizeof(*main_data);
     }
     frames->frame_set = 1;
     return 0;
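
With the 8-bit branch gone, frame data for VMAF is always read through CONVERT_TO_SHORTPTR and normalized by 1/(1 << (bit_depth - 8)), which maps samples back into the 8-bit range; for bit_depth == 8 the factor is exactly 1.0f, so upshifted 8-bit content passes through unchanged. A one-line sketch of that normalization (an illustrative helper, not part of the patch):

#include <stdint.h>

/* Illustrative only: normalize one 16-bit sample to the float range produced
 * by the unified copy loop above. */
static float to_vmaf_range(uint16_t sample, int bit_depth) {
  const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
  return scale_factor * (float)sample;
}
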
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index f0c8653..c0ecca2 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -16,40 +16,6 @@
 #include "aom_dsp/x86/convolve.h"
 
 #if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v4_sse2;
-filter8_1dfunction aom_filter_block1d16_h4_sse2;
-
-filter8_1dfunction aom_filter_block1d8_h4_sse2;
-filter8_1dfunction aom_filter_block1d8_v4_sse2;
-filter8_1dfunction aom_filter_block1d4_h4_sse2;
-filter8_1dfunction aom_filter_block1d4_v4_sse2;
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
deleted file mode 100644
index a3fea93..0000000
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ /dev/null
@@ -1,1442 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_ports/mem.h"
-
-#if defined(__clang__)
-#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
-    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
-    (defined(__APPLE__) && defined(__apple_build_version__) && \
-     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
-      (__clang_major__ == 5 && __clang_minor__ == 0)))
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#else  // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // clang <= 3.3
-#elif defined(__GNUC__)
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-#else  // gcc > 4.7
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // gcc <= 4.6
-#else   // !(gcc || clang)
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // __clang__
-
-static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
-                                    const ptrdiff_t stride, const __m256i *a) {
-  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
-  *((uint32_t *)(output_ptr + stride)) =
-      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
-  return a;
-}
-
-static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
-                                    const ptrdiff_t stride, const __m256i *a) {
-  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
-  _mm_storel_epi64((__m128i *)(output_ptr + stride),
-                   _mm256_extractf128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
-  return a;
-}
-
-static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
-                                   const ptrdiff_t stride, const __m256i *a) {
-  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
-  _mm_store_si128((__m128i *)(output_ptr + stride),
-                  _mm256_extractf128_si256(*a, 1));
-}
-
-static void aom_filter_block1d4_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  firstFilters =
-      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    srcRegFilt32b1_1 =
-        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 4 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
-    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 4 bytes
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d4_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg;
-  __m256i firstFilters, secondFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
-  __m256i srcReg32b1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 32 bits
-  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
-  // duplicate only the second 32 bits
-  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    // filter the source buffer
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    srcRegFilt32b1_1 =
-        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 4 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
-    // filter the source buffer
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 4 bytes
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt2Reg, filt3Reg;
-  __m256i secondFilters, thirdFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 8 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 8 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d16_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt2Reg, filt3Reg;
-  __m256i secondFilters, thirdFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    __m256i srcReg1, srcReg12;
-    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
-
-    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
-    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
-
-    // filter the source buffer
-    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
-    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
-    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
-    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
-    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt1_1));
-  }
-}
-
-static void aom_filter_block1d16_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
-    // filter the source buffer
-    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // reading the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    // filter the source buffer
-    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2_1 =
-        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg45_56_lo;
-  __m256i resReg23_34_lo, resReg45_56_lo;
-  __m256i resReglo, resReg;
-  __m256i secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
-    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReglo);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4x = srcReg6x;
-  }
-}
-
-static void aom_filter_block1d8_v8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each consecutive loads on the same 256 register
-  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
-  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
-  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
-
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // shift by 6 bit each 16 bit
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b7 = srcReg32b9;
-  }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
-  }
-}
-
-static void aom_filter_block1d16_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
-  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
-  __m256i resReglo, resReghi, resReg;
-  __m256i secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
-    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
-    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
-
-    // add and saturate the results together
-    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-    resReghi = _mm256_srai_epi16(resReghi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReghi);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg23_34_hi = srcReg45_56_hi;
-    srcReg4x = srcReg6x;
-  }
-}
-
-static void aom_filter_block1d16_v8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each consecutive loads on the same 256 register
-  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
-  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
-  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
-  // save
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
-
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // shift by 6 bit each 16 bit
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b1 = srcReg32b3;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b3 = srcReg32b5;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b5 = srcReg32b7;
-    srcReg32b7 = srcReg32b9;
-  }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-    srcRegFilt7 =
-        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt7 =
-        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
-                                    _mm256_castsi256_si128(secondFilters));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
-                                    _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
-  }
-}
-
-static void aom_filter_block1d4_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg45_56_lo;
-  __m256i srcReg2345_3456_lo;
-  __m256i resReglo, resReg;
-  __m256i firstFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  firstFilters =
-      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
-    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
-
-    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReglo);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4x = srcReg6x;
-  }
-}
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
-#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
-#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
-#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
-#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
-#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
-#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
-// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#endif  // HAVE_AVX2 && HAVE_SSSE3
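
For readers following the removed kernels above: every aom_filter_block1d{4,8,16}_{h,v}4 variant in these files computes the same arithmetic, just vectorised differently — the 8-tap coefficient register is pre-halved (srai by 1), only taps 2..5 are applied, and the result is rounded with +32, shifted right by 6 and saturated to 8 bits. Below is a minimal scalar sketch of the vertical pass as a reading aid; the function name and signature are hypothetical, only the indexing and rounding mirror the deleted code.

#include <stddef.h>
#include <stdint.h>

/* Reading aid only: scalar model of the removed *_v4 kernels. Name and
 * signature are illustrative, not part of the library. */
static void filter_block1d_v4_scalar(const uint8_t *src, ptrdiff_t src_pitch,
                                     uint8_t *dst, ptrdiff_t dst_pitch,
                                     int width, int height,
                                     const int16_t *filter /* 8 taps */) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      /* Only taps 2..5 are applied (the secondFilters/thirdFilters pairs in
       * the SIMD code); each tap is pre-halved, which is why the rounding
       * constant is 32 and the shift is 6 rather than 64 and 7. */
      for (int k = 2; k <= 5; ++k)
        sum += (filter[k] >> 1) * src[(ptrdiff_t)(y + k) * src_pitch + x];
      sum = (sum + 32) >> 6;    /* matches the +32 / _mm_srai_epi16(.., 6) step */
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255; /* matches the packus saturation */
      dst[(ptrdiff_t)y * dst_pitch + x] = (uint8_t)sum;
    }
  }
}
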
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
deleted file mode 100644
index a870451..0000000
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_ports/mem.h"
-
-void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
-                                  ptrdiff_t src_pixels_per_line,
-                                  uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                  uint32_t output_height,
-                                  const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
-      srcRegFilt32b2_2;
-  __m128i srcReg32b1, srcReg32b2;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
-    __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
-    __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
-    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
-    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // reading stride of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    ss_2 = _mm_srli_si128(srcReg32b2, 2);
-    ss_4 = _mm_srli_si128(srcReg32b2, 4);
-    ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
-    srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
-
-    ss_1 = _mm_srli_si128(srcReg32b2, 3);
-    ss_3 = _mm_srli_si128(srcReg32b2, 5);
-    ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
-    ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
-    srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
-
-    res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
-    res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
-    srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                  uint32_t output_height,
-                                  const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
-  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
-  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
-  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
-  __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
-  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
-  __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
-    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
-    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
-    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
-    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
-    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
-    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
-    resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
-    resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
-    __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
-    resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
-    __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
-    resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
-    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
-    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
-    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
-    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
-    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
-
-    src_ptr += src_stride;
-
-    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23_lo_1 = resReg45_lo_1;
-    resReg23_lo_2 = resReg45_lo_2;
-    resReg23_hi_1 = resReg45_hi_1;
-    resReg23_hi_2 = resReg45_hi_2;
-    resReg34_lo_1 = resReg56_lo_1;
-    resReg34_lo_2 = resReg56_lo_2;
-    resReg34_hi_1 = resReg56_hi_1;
-    resReg34_hi_2 = resReg56_hi_2;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
-                                 ptrdiff_t src_pixels_per_line,
-                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
-  __m128i srcReg32b1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
-    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_3, secondFilters);
-    d2 = _mm_madd_epi16(ss_5, thirdFilters);
-    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
-    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg34_lo;
-  __m128i srcReg45_lo, srcReg56_lo;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_45_lo, resReg34_56_lo;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
-    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
-    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
-    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
-    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
-    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
-    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
-    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
-    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
-    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
-
-    src_ptr += src_stride;
-
-    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
-    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23_lo_1 = resReg45_lo_1;
-    resReg23_lo_2 = resReg45_lo_2;
-    resReg34_lo_1 = resReg56_lo_1;
-    resReg34_lo_2 = resReg56_lo_2;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
-                                 ptrdiff_t src_pixels_per_line,
-                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1;
-  __m128i srcReg32b1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
-
-    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
-
-    __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
-    __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
-
-    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
-  __m128i resReg23_34, resReg45_56;
-  __m128i resReg23_34_45_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
-    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
-    resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
-    __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
-
-    tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
-    resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
-
-    // shift by 6 bit each 16 bit
-    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
-    resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_34_45_56 =
-        _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
-
-    src_ptr += src_stride;
-
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
-    *((uint32_t *)(output_ptr + out_pitch)) =
-        _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23 = resReg45;
-    resReg34 = resReg56;
-    srcReg4 = srcReg6;
-  }
-}
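
The SSE2 file deleted above has no pmaddubsw, so each kernel first widens the row-interleaved bytes against zero and accumulates with pmaddwd on duplicated coefficient pairs before packing back to 16 bits. A minimal sketch of that per-pair step is below; the helper name is illustrative, and the surrounding loop and rounding follow the scalar model sketched earlier.

#include <emmintrin.h>

/* Widen 8 interleaved source bytes (row n / row n+1) to 16-bit lanes, then
 * multiply-accumulate against the duplicated {c_n, c_n+1} coefficient pair
 * and pack the 32-bit pair sums back to signed 16 bits. */
static __m128i madd_row_pair_sse2(__m128i rows_interleaved, __m128i coef_pair) {
  const __m128i zero = _mm_setzero_si128();
  __m128i lo = _mm_unpacklo_epi8(rows_interleaved, zero); /* bytes -> u16 */
  __m128i hi = _mm_unpackhi_epi8(rows_interleaved, zero);
  __m128i sum_lo = _mm_madd_epi16(lo, coef_pair);         /* 32-bit pair sums */
  __m128i sum_hi = _mm_madd_epi16(hi, coef_pair);
  return _mm_packs_epi32(sum_lo, sum_hi);                 /* back to s16 */
}
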
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 7425373..0000000
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,771 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3,
-                                                              3, 4, 2, 3, 3, 4,
-                                                              4, 5, 5, 6 };
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7,
-                                                              7, 8, 6, 7, 7, 8,
-                                                              8, 9, 9, 10 };
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4,
-                                      4, 5, 5, 6, 6, 7, 7, 8 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6,
-                                      6, 7, 7, 8, 8, 9, 9, 10 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt3_global[16]) = { 4, 5, 5, 6,  6,  7,  7,  8,
-                                      8, 9, 9, 10, 10, 11, 11, 12 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                filt4_global[16]) = { 6,  7,  7,  8,  8,  9,  9,  10,
-                                      10, 11, 11, 12, 12, 13, 13, 14 };
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
-  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
-  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
-  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
-  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
-  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
-  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
-  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
-  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-
-static void aom_filter_block1d4_h4_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
-  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));
-
-  for (i = output_height; i > 0; i -= 1) {
-    // load the 2 strides of source
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
-    output_ptr += output_pitch;
-  }
-}
-
-static void aom_filter_block1d4_v4_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
-      srcReg6, srcReg56;
-  __m128i srcReg23_34_lo, srcReg45_56_lo;
-  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
-  __m128i resReglo, resReghi;
-  __m128i firstFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);
-
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);
-
-  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);
-
-    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
-    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
-    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);
-
-    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
-    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
-    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
-    resReglo = _mm_srai_epi16(resReglo, 6);
-    resReghi = _mm_srai_epi16(resReghi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReglo = _mm_packus_epi16(resReglo, resReglo);
-    resReghi = _mm_packus_epi16(resReghi, resReghi);
-
-    src_ptr += src_stride;
-
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
-    *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d4_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter into the first lane
-  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-  // duplicate only the third 16 bit in the filter into the first lane
-  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-  // duplicate only the seconds 16 bits in the filter into the second lane
-  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
-  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-  // duplicate only the forth 16 bits in the filter into the second lane
-  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
-  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-  // loading the local filters
-  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
-  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-    src_ptr += src_pixels_per_line;
-
-    // save only 4 bytes
-    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-static void aom_filter_block1d8_h4_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32, filt2Reg, filt3Reg;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m128i srcReg32b1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-
-  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
-  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-static void aom_filter_block1d8_v4_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
-  __m128i resReg23, resReg34, resReg45, resReg56;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
-
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-
-    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
-    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
-    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
-    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
-    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
-
-    // add and saturate the results together
-    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
-    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
-    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
-    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
-    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
-    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
-
-    src_ptr += src_stride;
-
-    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
-    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23 = srcReg45;
-    srcReg34 = srcReg56;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d8_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
-    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-    // add and saturate all the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pixels_per_line;
-
-    // save only 8 bytes
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  // load the first 7 rows of 8 bytes
-  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
-  for (i = 0; i < output_height; i++) {
-    // load the last 8 bytes
-    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
-    // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-    // add and saturate the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pitch;
-
-    // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
-
-    // save only 8 bytes convolve result
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += out_pitch;
-  }
-}
-
-static void aom_filter_block1d16_h4_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32, filt2Reg, filt3Reg;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m128i srcReg32b1, srcReg32b2;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-
-  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
-  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // reading stride of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-static void aom_filter_block1d16_v4_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
-  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
-  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-
-  // have consecutive loads on the same 256 register
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
-    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
-    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
-    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
-    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
-    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
-    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
-
-    // add and saturate the results together
-    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
-    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
-    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
-    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
-    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
-    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
-
-    src_ptr += src_stride;
-
-    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_lo = srcReg45_lo;
-    srcReg34_lo = srcReg56_lo;
-    srcReg23_hi = srcReg45_hi;
-    srcReg34_hi = srcReg56_hi;
-    srcReg4 = srcReg6;
-  }
-}
-
-filter8_1dfunction aom_filter_block1d16_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-
-// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
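
For context on what the removed SSSE3 convolve kernels above compute, here is a minimal scalar sketch of the 4-tap specialization (aom_filter_block1d*_h4/_v4). The function name, signature and loop layout are illustrative only, not library API, and the caller is assumed to provide the usual border padding:

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the removed 4-tap kernels: only the middle four taps of
 * the 8-tap filter are applied, each tap halved up front, and the sum is
 * rounded by 32, shifted by 6 and clamped to 8 bits. Illustrative only. */
static void filter_block1d_h4_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                uint32_t width, uint32_t height,
                                const int16_t *filter /* 8 taps */) {
  for (uint32_t y = 0; y < height; ++y) {
    for (uint32_t x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 2; k < 6; ++k)                 /* middle 4 taps only */
        sum += (filter[k] >> 1) * src[x + k - 3]; /* taps pre-halved */
      sum = (sum + 32) >> 6;                      /* round and shift */
      dst[x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }
    src += src_stride;
    dst += dst_stride;
  }
}

The 8-tap variants at the top of this hunk follow the same pattern but keep all eight taps and round with 64 before shifting by 7.
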
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index f5b5d27..ad9df3e 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -17,246 +17,6 @@
 #include "aom_dsp/x86/bitdepth_conversion_avx2.h"
 #include "aom_ports/mem.h"
 
-static void hadamard_col8x2_avx2(__m256i *in, int iter) {
-  __m256i a0 = in[0];
-  __m256i a1 = in[1];
-  __m256i a2 = in[2];
-  __m256i a3 = in[3];
-  __m256i a4 = in[4];
-  __m256i a5 = in[5];
-  __m256i a6 = in[6];
-  __m256i a7 = in[7];
-
-  __m256i b0 = _mm256_add_epi16(a0, a1);
-  __m256i b1 = _mm256_sub_epi16(a0, a1);
-  __m256i b2 = _mm256_add_epi16(a2, a3);
-  __m256i b3 = _mm256_sub_epi16(a2, a3);
-  __m256i b4 = _mm256_add_epi16(a4, a5);
-  __m256i b5 = _mm256_sub_epi16(a4, a5);
-  __m256i b6 = _mm256_add_epi16(a6, a7);
-  __m256i b7 = _mm256_sub_epi16(a6, a7);
-
-  a0 = _mm256_add_epi16(b0, b2);
-  a1 = _mm256_add_epi16(b1, b3);
-  a2 = _mm256_sub_epi16(b0, b2);
-  a3 = _mm256_sub_epi16(b1, b3);
-  a4 = _mm256_add_epi16(b4, b6);
-  a5 = _mm256_add_epi16(b5, b7);
-  a6 = _mm256_sub_epi16(b4, b6);
-  a7 = _mm256_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm256_add_epi16(a0, a4);
-    b7 = _mm256_add_epi16(a1, a5);
-    b3 = _mm256_add_epi16(a2, a6);
-    b4 = _mm256_add_epi16(a3, a7);
-    b2 = _mm256_sub_epi16(a0, a4);
-    b6 = _mm256_sub_epi16(a1, a5);
-    b1 = _mm256_sub_epi16(a2, a6);
-    b5 = _mm256_sub_epi16(a3, a7);
-
-    a0 = _mm256_unpacklo_epi16(b0, b1);
-    a1 = _mm256_unpacklo_epi16(b2, b3);
-    a2 = _mm256_unpackhi_epi16(b0, b1);
-    a3 = _mm256_unpackhi_epi16(b2, b3);
-    a4 = _mm256_unpacklo_epi16(b4, b5);
-    a5 = _mm256_unpacklo_epi16(b6, b7);
-    a6 = _mm256_unpackhi_epi16(b4, b5);
-    a7 = _mm256_unpackhi_epi16(b6, b7);
-
-    b0 = _mm256_unpacklo_epi32(a0, a1);
-    b1 = _mm256_unpacklo_epi32(a4, a5);
-    b2 = _mm256_unpackhi_epi32(a0, a1);
-    b3 = _mm256_unpackhi_epi32(a4, a5);
-    b4 = _mm256_unpacklo_epi32(a2, a3);
-    b5 = _mm256_unpacklo_epi32(a6, a7);
-    b6 = _mm256_unpackhi_epi32(a2, a3);
-    b7 = _mm256_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm256_unpacklo_epi64(b0, b1);
-    in[1] = _mm256_unpackhi_epi64(b0, b1);
-    in[2] = _mm256_unpacklo_epi64(b2, b3);
-    in[3] = _mm256_unpackhi_epi64(b2, b3);
-    in[4] = _mm256_unpacklo_epi64(b4, b5);
-    in[5] = _mm256_unpackhi_epi64(b4, b5);
-    in[6] = _mm256_unpacklo_epi64(b6, b7);
-    in[7] = _mm256_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm256_add_epi16(a0, a4);
-    in[7] = _mm256_add_epi16(a1, a5);
-    in[3] = _mm256_add_epi16(a2, a6);
-    in[4] = _mm256_add_epi16(a3, a7);
-    in[2] = _mm256_sub_epi16(a0, a4);
-    in[6] = _mm256_sub_epi16(a1, a5);
-    in[1] = _mm256_sub_epi16(a2, a6);
-    in[5] = _mm256_sub_epi16(a3, a7);
-  }
-}
-
-static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
-                                int16_t *coeff) {
-  __m256i src[8];
-  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
-  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
-
-  hadamard_col8x2_avx2(src, 0);
-  hadamard_col8x2_avx2(src, 1);
-
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
-  coeff += 16;
-  _mm256_storeu_si256((__m256i *)coeff,
-                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
-}
-
-static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
-                                       ptrdiff_t src_stride, tran_low_t *coeff,
-                                       int is_final) {
-  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
-  int16_t *t_coeff = temp_coeff;
-  int16_t *coeff16 = (int16_t *)coeff;
-  int idx;
-  for (idx = 0; idx < 2; ++idx) {
-    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
-    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
-  }
-
-  for (idx = 0; idx < 64; idx += 16) {
-    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
-    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
-    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
-    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
-
-    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
-    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
-    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
-    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm256_srai_epi16(b0, 1);
-    b1 = _mm256_srai_epi16(b1, 1);
-    b2 = _mm256_srai_epi16(b2, 1);
-    b3 = _mm256_srai_epi16(b3, 1);
-    if (is_final) {
-      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
-      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
-      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
-      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
-      coeff += 16;
-    } else {
-      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
-      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
-      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
-      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
-      coeff16 += 16;
-    }
-    t_coeff += 16;
-  }
-}
-
-void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
-                             tran_low_t *coeff) {
-  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
-}
-
-void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
-                                int16_t *coeff) {
-  int16_t *t_coeff = coeff;
-  for (int idx = 0; idx < 2; ++idx) {
-    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
-    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
-  }
-
-  for (int idx = 0; idx < 64; idx += 16) {
-    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
-    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
-    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
-    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
-
-    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
-    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
-    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
-    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm256_srai_epi16(b0, 1);
-    b1 = _mm256_srai_epi16(b1, 1);
-    b2 = _mm256_srai_epi16(b2, 1);
-    b3 = _mm256_srai_epi16(b3, 1);
-    _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2));
-    _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3));
-    _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2));
-    _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3));
-    coeff += 16;
-    t_coeff += 16;
-  }
-}
-
-void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
-                             tran_low_t *coeff) {
-  // For high bitdepths, it is unnecessary to store_tran_low
-  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
-  // next stage.  Output to an intermediate buffer first, then store_tran_low()
-  // in the final stage.
-  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
-  int16_t *t_coeff = temp_coeff;
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
-    hadamard_16x16_avx2(src_ptr, src_stride,
-                        (tran_low_t *)(t_coeff + idx * 256), 0);
-  }
-
-  for (idx = 0; idx < 256; idx += 16) {
-    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
-    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
-    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
-    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
-
-    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
-    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
-    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
-    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm256_srai_epi16(b0, 2);
-    b1 = _mm256_srai_epi16(b1, 2);
-    b2 = _mm256_srai_epi16(b2, 2);
-    b3 = _mm256_srai_epi16(b3, 2);
-
-    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
-    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
-    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
-    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
-
-    coeff += 16;
-    t_coeff += 16;
-  }
-}
-
 static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
   __m256i a0 = in[0];
   __m256i a1 = in[1];
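
The removed low-bitdepth Hadamard kernels above (and their SSE2 counterparts in the next file) are all built from the same 8-point butterfly. A scalar sketch of one column pass follows; it drops the output permutation the SIMD code uses for lane shuffling, so names and ordering here are illustrative rather than a drop-in replacement:

#include <stdint.h>

/* One 8-point Hadamard butterfly pass (three add/subtract stages). */
static void hadamard_col8_c(const int16_t *in, int16_t *out) {
  int16_t b[8], c[8];
  for (int i = 0; i < 4; ++i) {        /* stage 1: adjacent pairs */
    b[2 * i] = in[2 * i] + in[2 * i + 1];
    b[2 * i + 1] = in[2 * i] - in[2 * i + 1];
  }
  for (int i = 0; i < 2; ++i) {        /* stage 2: within each quad */
    c[4 * i + 0] = b[4 * i + 0] + b[4 * i + 2];
    c[4 * i + 1] = b[4 * i + 1] + b[4 * i + 3];
    c[4 * i + 2] = b[4 * i + 0] - b[4 * i + 2];
    c[4 * i + 3] = b[4 * i + 1] - b[4 * i + 3];
  }
  for (int i = 0; i < 4; ++i) {        /* stage 3: across the two halves */
    out[i] = c[i] + c[i + 4];
    out[i + 4] = c[i] - c[i + 4];
  }
}

The removed 16x16 and 32x32 kernels then combine four sub-blocks with one more butterfly level, shifting right by 1 and 2 respectively to keep the coefficients in 16-bit range.
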
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index cc7b41d..59059e9 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -17,403 +17,6 @@
 #include "aom_dsp/x86/bitdepth_conversion_sse2.h"
 #include "aom_ports/mem.h"
 
-void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0 = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0 = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 32) >> 6;
-}
-
-unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0 = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 8) >> 4;
-}
-
-static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
-                                     ptrdiff_t src_stride, tran_low_t *coeff,
-                                     int is_final) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  if (is_final) {
-    store_tran_low(src[0], coeff);
-    coeff += 8;
-    store_tran_low(src[1], coeff);
-    coeff += 8;
-    store_tran_low(src[2], coeff);
-    coeff += 8;
-    store_tran_low(src[3], coeff);
-    coeff += 8;
-    store_tran_low(src[4], coeff);
-    coeff += 8;
-    store_tran_low(src[5], coeff);
-    coeff += 8;
-    store_tran_low(src[6], coeff);
-    coeff += 8;
-    store_tran_low(src[7], coeff);
-  } else {
-    int16_t *coeff16 = (int16_t *)coeff;
-    _mm_store_si128((__m128i *)coeff16, src[0]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[1]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[2]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[3]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[4]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[5]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[6]);
-    coeff16 += 8;
-    _mm_store_si128((__m128i *)coeff16, src[7]);
-  }
-}
-
-void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                           tran_low_t *coeff) {
-  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
-}
-
-void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                              int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
-                                       ptrdiff_t src_stride, tran_low_t *coeff,
-                                       int is_final) {
-  // For high bitdepths, it is unnecessary to store_tran_low
-  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
-  // next stage.  Output to an intermediate buffer first, then store_tran_low()
-  // in the final stage.
-  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
-  int16_t *t_coeff = temp_coeff;
-  int16_t *coeff16 = (int16_t *)coeff;
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
-                      0);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-
-    if (is_final) {
-      store_tran_low(coeff0, coeff);
-      store_tran_low(coeff1, coeff + 64);
-      store_tran_low(coeff2, coeff + 128);
-      store_tran_low(coeff3, coeff + 192);
-      coeff += 8;
-    } else {
-      _mm_store_si128((__m128i *)coeff16, coeff0);
-      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
-      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
-      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
-      coeff16 += 8;
-    }
-
-    t_coeff += 8;
-  }
-}
-
-void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                             tran_low_t *coeff) {
-  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
-}
-
-void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                             tran_low_t *coeff) {
-  // For high bitdepths, it is unnecessary to store_tran_low
-  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
-  // next stage.  Output to an intermediate buffer first, then store_tran_low()
-  // in the final stage.
-  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
-  int16_t *t_coeff = temp_coeff;
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
-    hadamard_16x16_sse2(src_ptr, src_stride,
-                        (tran_low_t *)(t_coeff + idx * 256), 0);
-  }
-
-  for (idx = 0; idx < 256; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 2);
-    b1 = _mm_srai_epi16(b1, 2);
-    b2 = _mm_srai_epi16(b2, 2);
-    b3 = _mm_srai_epi16(b3, 2);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    store_tran_low(coeff0, coeff);
-    store_tran_low(coeff1, coeff + 256);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    store_tran_low(coeff2, coeff + 512);
-    store_tran_low(coeff3, coeff + 768);
-
-    coeff += 8;
-    t_coeff += 8;
-  }
-}
-
 int aom_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
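
The removed averaging helpers above reduce to straightforward arithmetic; a scalar sketch of the 8x8 case (the 4x4 variant is the same with a 4-wide loop and (sum + 8) >> 4), with an illustrative function name:

#include <stdint.h>

/* Scalar sketch of the removed aom_avg_8x8_sse2: sum the 8x8 block and
 * return the rounded average. The aom_minmax_8x8_sse2 kernel removed above
 * is the analogous reduction, tracking min/max of |s[i] - d[i]| instead. */
static unsigned int avg_8x8_c(const uint8_t *s, int p) {
  unsigned int sum = 0;
  for (int r = 0; r < 8; ++r, s += p)
    for (int c = 0; c < 8; ++c) sum += s[c];
  return (sum + 32) >> 6; /* round to nearest */
}
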
diff --git a/aom_dsp/x86/blend_a64_hmask_sse4.c b/aom_dsp/x86/blend_a64_hmask_sse4.c
index 57a4026..dc49fcc 100644
--- a/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ b/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -17,14 +17,6 @@
 // To start out, just dispatch to the function using the 2D mask and
 // pass mask stride as 0. This can be improved upon if necessary.
 
-void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, 0, w, h, 0, 0);
-}
-
 void aom_highbd_blend_a64_hmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
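
The low-bitdepth blend kernels removed here and in the next file all implement the same 6-bit alpha blend, dst[i] = (m * src0[i] + (64 - m) * src1[i] + 32) >> 6. A scalar sketch of the hmask case, with illustrative names:

#include <stdint.h>

/* Scalar sketch of the removed aom_blend_a64_hmask_sse4_1: a horizontal
 * mask supplies one 6-bit weight (0..64) per column, applied to every row. */
static void blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
                              const uint8_t *src0, uint32_t src0_stride,
                              const uint8_t *src1, uint32_t src1_stride,
                              const uint8_t *mask, int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int m = mask[x];
      dst[x] = (uint8_t)((m * src0[x] + (64 - m) * src1[x] + 32) >> 6);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

The d16 variants removed in the next hunk apply the same weighting to the 16-bit intermediate buffers produced by compound convolution, subtracting a round_offset and shifting by round_bits + AOM_BLEND_A64_ROUND_BITS before packing back to 8 bits.
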
diff --git a/aom_dsp/x86/blend_a64_mask_avx2.c b/aom_dsp/x86/blend_a64_mask_avx2.c
index 0942010..fdd87c1 100644
--- a/aom_dsp/x86/blend_a64_mask_avx2.c
+++ b/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -26,880 +26,6 @@
 
 #include "config/aom_dsp_rtcd.h"
 
-static INLINE void blend_a64_d16_mask_w16_avx2(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
-    int shift) {
-  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
-  const __m256i s0_0 = yy_loadu_256(src0);
-  const __m256i s1_0 = yy_loadu_256(src1);
-  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
-                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
-  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
-                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
-  res0_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
-  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
-  __m256i res = _mm256_packus_epi16(res0, res0);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
-}
-
-static INLINE void blend_a64_d16_mask_w32_avx2(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
-    const __m256i *v_maxval, int shift) {
-  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
-  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
-  const __m256i s0_0 = yy_loadu_256(src0);
-  const __m256i s0_1 = yy_loadu_256(src0 + 16);
-  const __m256i s1_0 = yy_loadu_256(src1);
-  const __m256i s1_1 = yy_loadu_256(src1 + 16);
-  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
-                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
-  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
-                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
-  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
-                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
-  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
-                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
-  res0_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
-  res1_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
-  res1_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
-  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
-  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
-  __m256i res = _mm256_packus_epi16(res0, res1);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  _mm256_storeu_si256((__m256i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m = xx_loadu_128(mask);
-    const __m256i m0 = _mm256_cvtepu8_epi16(m);
-
-    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m = yy_loadu_256(mask + j);
-      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
-      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i two_w = _mm256_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m256i m_i00 = yy_loadu_256(mask);
-    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
-
-    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
-    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
-    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-
-    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i two_w = _mm256_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
-      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
-      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
-
-      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
-      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
-      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
-      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
-      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
-      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-
-      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
-      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
-      const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
-      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-      const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + j);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
-      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
-      const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
-
-      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + j);
-      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
-
-      const __m256i m_ac =
-          _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
-      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
-      const __m256i m1 =
-          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-void aom_lowbd_blend_a64_d16_mask_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  const int round_offset =
-      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
-       (1 << (round_bits - 1)))
-      << AOM_BLEND_A64_ROUND_BITS;
-
-  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
-
-  if (subw == 0 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 1) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  }
-}
-
-static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
-                                       const __m256i *v_m0_b,
-                                       const __m256i *v_m1_b,
-                                       const int32_t bits) {
-  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
-  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
-  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
-  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
-
-  const __m256i v_p0_w =
-      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
-                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
-  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
-  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
-  return v_res;
-}
-
-static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
-                                       const __m256i *v_m0_b,
-                                       const __m256i *v_m1_b,
-                                       const int32_t bits) {
-  const __m256i v_s0_b = yy_loadu_256(src0);
-  const __m256i v_s1_b = yy_loadu_256(src1);
-
-  const __m256i v_p0_w =
-      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
-                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-  const __m256i v_p1_w =
-      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
-                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
-  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
-  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
-  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
-  return v_res;
-}
-
-static INLINE void blend_a64_mask_sx_sy_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    const __m256i v_ral_b = yy_loadu_256(mask);
-    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
-    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
-    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
-    const __m256i v_rvsbl_w =
-        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
-    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-
-    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
-    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
-    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
-                                             AOM_BLEND_A64_ROUND_BITS);
-
-    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
-      const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
-      const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
-      const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
-      const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
-      const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
-      const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
-      const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
-      const __m256i v_rvsbl_w =
-          _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
-      const __m256i v_rvsbh_w =
-          _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
-      const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-      const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
-
-      const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
-      const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
-      const __m256i v_m0_b =
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_ra_b = xx_loadl_64(mask);
-        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_ra_b = xx_loadu_128(mask);
-        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h);
-      break;
-  }
-}
-
-static INLINE void blend_a64_mask_sx_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
-  do {
-    const __m256i v_rl_b = yy_loadu_256(mask);
-    const __m256i v_al_b =
-        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
-
-    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
-    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
-    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
-                                             AOM_BLEND_A64_ROUND_BITS);
-
-    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
-      const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
-      const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
-      const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
-      const __m256i v_al_b =
-          _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
-      const __m256i v_ah_b =
-          _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
-
-      const __m256i v_m0_b =
-          _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_r_b = xx_loadl_64(mask);
-        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_r_b = xx_loadu_128(mask);
-        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h);
-      break;
-  }
-}
-
-static INLINE void blend_a64_mask_sy_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    const __m128i v_ra_b = xx_loadu_128(mask);
-    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storeu_128(dst, v_res_b);
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_ra_b = yy_loadu_256(mask + c);
-      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
-      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_ra_b = xx_loadl_32(mask);
-        const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_ra_b = xx_loadl_64(mask);
-        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h);
-  }
-}
-
-static INLINE void blend_a64_mask_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_m0_b = yy_loadu_256(mask + c);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_m0_b = xx_loadl_32(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_m0_b = xx_loadl_64(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      do {
-        const __m128i v_m0_b = xx_loadu_128(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storeu_128(dst, v_res_b);
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    default:
-      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h);
-  }
-}
-
-void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
-                             const uint8_t *src0, uint32_t src0_stride,
-                             const uint8_t *src1, uint32_t src1_stride,
-                             const uint8_t *mask, uint32_t mask_stride, int w,
-                             int h, int subw, int subh) {
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subw, subh);
-  } else {
-    if (subw & subh) {
-      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, mask_stride, w, h);
-    } else if (subw) {
-      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
-                             src1_stride, mask, mask_stride, w, h);
-    } else if (subh) {
-      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
-                             src1_stride, mask, mask_stride, w, h);
-    } else {
-      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                          mask, mask_stride, w, h);
-    }
-  }
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // aom_highbd_blend_a64_d16_mask_avx2()
 //////////////////////////////////////////////////////////////////////////////
diff --git a/aom_dsp/x86/blend_a64_mask_sse4.c b/aom_dsp/x86/blend_a64_mask_sse4.c
index 342470c..fdc8924 100644
--- a/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -29,405 +29,6 @@
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                     const uint8_t *src0, uint32_t src0_stride,
-                                     const uint8_t *src1, uint32_t src1_stride,
-                                     const uint8_t *mask, uint32_t mask_stride,
-                                     int w, int h) {
-  (void)w;
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                     const uint8_t *src0, uint32_t src0_stride,
-                                     const uint8_t *src1, uint32_t src1_stride,
-                                     const uint8_t *mask, uint32_t mask_stride,
-                                     int w, int h) {
-  (void)w;
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_m0_b = xx_loadl_64(mask);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_m0_b = xx_loadu_128(mask + c);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_r_b = xx_loadu_128(mask);
-    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
-      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
-      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
-      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
-      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sy_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    const __m128i v_ra_b = xx_loadl_32(mask);
-    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sy_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sy_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_ra_b = xx_loadu_128(mask + c);
-      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
-      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_sy_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  (void)w;
-
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  (void)w;
-
-  do {
-    const __m128i v_ra_b = xx_loadu_128(mask);
-    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b =
-      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
-      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
-      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
-      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
-      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
-      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
-      const __m128i v_rvsbl_w =
-          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
-      const __m128i v_rvsbh_w =
-          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
-      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
-      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
-
-      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
-      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
-      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                               const uint8_t *src0, uint32_t src0_stride,
-                               const uint8_t *src1, uint32_t src1_stride,
-                               const uint8_t *mask, uint32_t mask_stride, int w,
-                               int h, int subw, int subh) {
-  typedef void (*blend_fn)(
-      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
-      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
-  // Dimensions are: width_index X subx X suby
-  static const blend_fn blend[3][2][2] = {
-    { // w % 16 == 0
-      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
-      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
-    { // w == 4
-      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
-      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
-    { // w == 8
-      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
-      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
-  };
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subw, subh);
-  } else {
-    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
-                                              src0_stride, src1, src1_stride,
-                                              mask, mask_stride, w, h);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
 static INLINE void blend_a64_mask_bn_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
@@ -871,243 +472,6 @@
   }
 }
 
-static INLINE void blend_a64_d16_mask_w16_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
-    const __m128i *v_maxval, int shift) {
-  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
-  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
-  const __m128i s0_0 = xx_loadu_128(src0);
-  const __m128i s0_1 = xx_loadu_128(src0 + 8);
-  const __m128i s1_0 = xx_loadu_128(src1);
-  const __m128i s1_1 = xx_loadu_128(src1 + 8);
-  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
-                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
-  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
-                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
-  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
-                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
-  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
-                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
-  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
-  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
-  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
-  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
-  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
-  const __m128i res = _mm_packus_epi16(res0, res1);
-
-  _mm_storeu_si128((__m128i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m = xx_loadu_128(mask + j);
-      const __m128i m0 = _mm_cvtepu8_epi16(m);
-      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
-      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
-      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
-
-      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
-      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
-      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
-      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
-      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
-      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
-      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
-      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
-      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
-      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
-      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + j);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
-      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
-      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
-      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-void aom_lowbd_blend_a64_d16_mask_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  const int round_offset =
-      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
-       (1 << (round_bits - 1)))
-      << AOM_BLEND_A64_ROUND_BITS;
-
-  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-
-  if (subw == 0 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-
-  } else if (subw == 1 && subh == 1) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  }
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // aom_highbd_blend_a64_d16_mask_sse4_1()
 //////////////////////////////////////////////////////////////////////////////
diff --git a/aom_dsp/x86/blend_a64_vmask_sse4.c b/aom_dsp/x86/blend_a64_vmask_sse4.c
index 3ae6e85..7554c66 100644
--- a/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -28,126 +28,6 @@
 // Implementation - No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                      const uint8_t *src0, uint32_t src0_stride,
-                                      const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  (void)w;
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                      const uint8_t *src0, uint32_t src0_stride,
-                                      const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  (void)w;
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                        const uint8_t *src0,
-                                        uint32_t src0_stride,
-                                        const uint8_t *src1,
-                                        uint32_t src1_stride,
-                                        const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h);
-
-  // Dimension: width_index
-  static const blend_fn blend[9] = {
-    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
-    aom_blend_a64_vmask_c,        // w == 1
-    aom_blend_a64_vmask_c,        // w == 2
-    NULL,                         // INVALID
-    blend_a64_vmask_w4_sse4_1,    // w == 4
-    NULL,                         // INVALID
-    NULL,                         // INVALID
-    NULL,                         // INVALID
-    blend_a64_vmask_w8_sse4_1,    // w == 8
-  };
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
-                 h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
 static INLINE void blend_a64_vmask_bn_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index d551689..3ec440f 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -19,95 +19,6 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                uint32_t output_height, const int16_t *filter);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
-  void aom_convolve8_##name##_##opt(                                         \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    (void)filter_x;                                                          \
-    (void)x_step_q4;                                                         \
-    (void)filter_y;                                                          \
-    (void)y_step_q4;                                                         \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
-    assert(step_q4 == 16);                                                   \
-    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
-        (filter[2] | filter[5])) {                                           \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    } else if (filter[0] | filter[1] | filter[2]) {                          \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    } else {                                                                 \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    }                                                                        \
-    if (w) {                                                                 \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
-                               x_step_q4, filter_y, y_step_q4, w, h);        \
-    }                                                                        \
-  }
-
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
diff --git a/aom_dsp/x86/fwd_txfm_impl_sse2.h b/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 2c470cd..e7e4b81 100644
--- a/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -22,15 +22,9 @@
 // TODO(jingning) The high bit-depth functions need rework for performance.
 // After we properly fix the high bit-depth function implementations, this
 // file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
 #define ADD_EPI16 _mm_adds_epi16
 #define SUB_EPI16 _mm_subs_epi16
 
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
 static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
                               __m128i *in1) {
   // Constants
@@ -246,9 +240,7 @@
   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
   int overflow;
-#endif
   // Load input
   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
@@ -284,7 +276,6 @@
     const __m128i q5 = SUB_EPI16(in2, in5);
     const __m128i q6 = SUB_EPI16(in1, in6);
     const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
     if (pass == 1) {
       overflow =
           check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
@@ -293,7 +284,6 @@
         return;
       }
     }
-#endif  // DCT_HIGH_BIT_DEPTH
     // Work on first four results
     {
       // Add/subtract
@@ -301,13 +291,11 @@
       const __m128i r1 = ADD_EPI16(q1, q2);
       const __m128i r2 = SUB_EPI16(q1, q2);
       const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
       if (overflow) {
         aom_highbd_fdct8x8_c(input, output, stride);
         return;
       }
-#endif  // DCT_HIGH_BIT_DEPTH
       // Interleave to do the multiply by constants which gets us into 32bits
       {
         const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
@@ -344,13 +332,11 @@
         res4 = _mm_packs_epi32(w2, w3);
         res2 = _mm_packs_epi32(w4, w5);
         res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
         if (overflow) {
           aom_highbd_fdct8x8_c(input, output, stride);
           return;
         }
-#endif  // DCT_HIGH_BIT_DEPTH
       }
     }
     // Work on next four results
@@ -374,26 +360,22 @@
       // Combine
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
       overflow = check_epi16_overflow_x2(&r0, &r1);
       if (overflow) {
         aom_highbd_fdct8x8_c(input, output, stride);
         return;
       }
-#endif  // DCT_HIGH_BIT_DEPTH
       {
         // Add/subtract
         const __m128i x0 = ADD_EPI16(q4, r0);
         const __m128i x1 = SUB_EPI16(q4, r0);
         const __m128i x2 = SUB_EPI16(q7, r1);
         const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
         if (overflow) {
           aom_highbd_fdct8x8_c(input, output, stride);
           return;
         }
-#endif  // DCT_HIGH_BIT_DEPTH
         // Interleave to do the multiply by constants which gets us into 32bits
         {
           const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
@@ -430,13 +412,11 @@
           res7 = _mm_packs_epi32(w2, w3);
           res5 = _mm_packs_epi32(w4, w5);
           res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
           overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
           if (overflow) {
             aom_highbd_fdct8x8_c(input, output, stride);
             return;
           }
-#endif  // DCT_HIGH_BIT_DEPTH
         }
       }
     }
diff --git a/aom_dsp/x86/fwd_txfm_sse2.c b/aom_dsp/x86/fwd_txfm_sse2.c
index a645bdb..de37f0b 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/aom_dsp/x86/fwd_txfm_sse2.c
@@ -18,19 +18,12 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 
-#define DCT_HIGH_BIT_DEPTH 0
 #define FDCT4x4_2D_HELPER fdct4x4_helper
 #define FDCT4x4_2D aom_fdct4x4_sse2
 #define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2
-#define FDCT8x8_2D aom_fdct8x8_sse2
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
 #undef FDCT4x4_2D_HELPER
 #undef FDCT4x4_2D
 #undef FDCT4x4_2D_LP
 #undef FDCT8x8_2D
-
-#undef DCT_HIGH_BIT_DEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT8x8_2D
diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
deleted file mode 100644
index 723df21..0000000
--- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585,  11585
-TRANSFORM_COEFFS 15137,   6270
-TRANSFORM_COEFFS 16069,   3196
-TRANSFORM_COEFFS  9102,  13623
-
-%macro STORE_OUTPUT 2 ; index, result
-  ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  ; _mm_store_si128((__m128i *)(dst_ptr), out0);
-  ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-  pxor               m11, m11
-  pcmpgtw            m11, m%2
-  movdqa             m12, m%2
-  punpcklwd          m%2, m11
-  punpckhwd          m12, m11
-  mova               [outputq + 4*%1 +  0], m%2
-  mova               [outputq + 4*%1 + 16], m12
-%endmacro
-
-SECTION .text
-
-%if ARCH_X86_64
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
-  mova               m8, [GLOBAL(pd_8192)]
-  mova              m12, [GLOBAL(pw_11585x2)]
-
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  ; left shift by 2 to increase forward transformation precision
-  psllw              m0, 2
-  psllw              m1, 2
-  psllw              m2, 2
-  psllw              m3, 2
-  psllw              m4, 2
-  psllw              m5, 2
-  psllw              m6, 2
-  psllw              m7, 2
-
-  ; column transform
-  ; stage 1
-  paddw m10, m0, m7
-  psubw m0, m7
-
-  paddw m9, m1, m6
-  psubw m1, m6
-
-  paddw m7, m2, m5
-  psubw m2, m5
-
-  paddw m6, m3, m4
-  psubw m3, m4
-
-  ; stage 2
-  paddw m5, m9, m7
-  psubw m9, m7
-
-  paddw m4, m10, m6
-  psubw m10, m6
-
-  paddw m7, m1, m2
-  psubw m1, m2
-
-  ; stage 3
-  paddw m6, m4, m5
-  psubw m4, m5
-
-  pmulhrsw m1, m12
-  pmulhrsw m7, m12
-
-  ; sin(pi / 8), cos(pi / 8)
-  punpcklwd m2, m10, m9
-  punpckhwd m10, m9
-  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
-  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
-  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
-  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
-  paddd m5, m8
-  paddd m2, m8
-  paddd m9, m8
-  paddd m10, m8
-  psrad m5, 14
-  psrad m2, 14
-  psrad m9, 14
-  psrad m10, 14
-  packssdw m5, m9
-  packssdw m2, m10
-
-  pmulhrsw m6, m12
-  pmulhrsw m4, m12
-
-  paddw m9, m3, m1
-  psubw m3, m1
-
-  paddw m10, m0, m7
-  psubw m0, m7
-
-  ; stage 4
-  ; sin(pi / 16), cos(pi / 16)
-  punpcklwd m1, m10, m9
-  punpckhwd m10, m9
-  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
-  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
-  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
-  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
-  paddd m7, m8
-  paddd m1, m8
-  paddd m9, m8
-  paddd m10, m8
-  psrad m7, 14
-  psrad m1, 14
-  psrad m9, 14
-  psrad m10, 14
-  packssdw m7, m9
-  packssdw m1, m10
-
-  ; sin(3 * pi / 16), cos(3 * pi / 16)
-  punpcklwd m11, m0, m3
-  punpckhwd m0, m3
-  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
-  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
-  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
-  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
-  paddd m9, m8
-  paddd m11, m8
-  paddd m3, m8
-  paddd m0, m8
-  psrad m9, 14
-  psrad m11, 14
-  psrad m3, 14
-  psrad m0, 14
-  packssdw m9, m3
-  packssdw m11, m0
-
-  ; transpose
-  ; stage 1
-  punpcklwd m0, m6, m7
-  punpcklwd m3, m5, m11
-  punpckhwd m6, m7
-  punpckhwd m5, m11
-  punpcklwd m7, m4, m9
-  punpcklwd m10, m2, m1
-  punpckhwd m4, m9
-  punpckhwd m2, m1
-
-  ; stage 2
-  punpckldq m9, m0, m3
-  punpckldq m1, m6, m5
-  punpckhdq m0, m3
-  punpckhdq m6, m5
-  punpckldq m3, m7, m10
-  punpckldq m5, m4, m2
-  punpckhdq m7, m10
-  punpckhdq m4, m2
-
-  ; stage 3
-  punpcklqdq m10, m9, m3
-  punpckhqdq m9, m3
-  punpcklqdq m2, m0, m7
-  punpckhqdq m0, m7
-  punpcklqdq m3, m1, m5
-  punpckhqdq m1, m5
-  punpcklqdq m7, m6, m4
-  punpckhqdq m6, m4
-
-  ; row transform
-  ; stage 1
-  paddw m5, m10, m6
-  psubw m10, m6
-
-  paddw m4, m9, m7
-  psubw m9, m7
-
-  paddw m6, m2, m1
-  psubw m2, m1
-
-  paddw m7, m0, m3
-  psubw m0, m3
-
-  ;stage 2
-  paddw m1, m5, m7
-  psubw m5, m7
-
-  paddw m3, m4, m6
-  psubw m4, m6
-
-  paddw m7, m9, m2
-  psubw m9, m2
-
-  ; stage 3
-  punpcklwd m6, m1, m3
-  punpckhwd m1, m3
-  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
-  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
-  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
-  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
-  paddd m2, m8
-  paddd m6, m8
-  paddd m3, m8
-  paddd m1, m8
-  psrad m2, 14
-  psrad m6, 14
-  psrad m3, 14
-  psrad m1, 14
-  packssdw m2, m3
-  packssdw m6, m1
-
-  pmulhrsw m7, m12
-  pmulhrsw m9, m12
-
-  punpcklwd m3, m5, m4
-  punpckhwd m5, m4
-  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
-  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
-  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
-  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
-  paddd m1, m8
-  paddd m3, m8
-  paddd m4, m8
-  paddd m5, m8
-  psrad m1, 14
-  psrad m3, 14
-  psrad m4, 14
-  psrad m5, 14
-  packssdw m1, m4
-  packssdw m3, m5
-
-  paddw m4, m0, m9
-  psubw m0, m9
-
-  paddw m5, m10, m7
-  psubw m10, m7
-
-  ; stage 4
-  punpcklwd m9, m5, m4
-  punpckhwd m5, m4
-  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
-  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
-  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
-  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
-  paddd m7, m8
-  paddd m9, m8
-  paddd m4, m8
-  paddd m5, m8
-  psrad m7, 14
-  psrad m9, 14
-  psrad m4, 14
-  psrad m5, 14
-  packssdw m7, m4
-  packssdw m9, m5
-
-  punpcklwd m4, m10, m0
-  punpckhwd m10, m0
-  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
-  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
-  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
-  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
-  paddd m5, m8
-  paddd m4, m8
-  paddd m0, m8
-  paddd m10, m8
-  psrad m5, 14
-  psrad m4, 14
-  psrad m0, 14
-  psrad m10, 14
-  packssdw m5, m0
-  packssdw m4, m10
-
-  ; transpose
-  ; stage 1
-  punpcklwd m0, m2, m7
-  punpcklwd m10, m1, m4
-  punpckhwd m2, m7
-  punpckhwd m1, m4
-  punpcklwd m7, m6, m5
-  punpcklwd m4, m3, m9
-  punpckhwd m6, m5
-  punpckhwd m3, m9
-
-  ; stage 2
-  punpckldq m5, m0, m10
-  punpckldq m9, m2, m1
-  punpckhdq m0, m10
-  punpckhdq m2, m1
-  punpckldq m10, m7, m4
-  punpckldq m1, m6, m3
-  punpckhdq m7, m4
-  punpckhdq m6, m3
-
-  ; stage 3
-  punpcklqdq m4, m5, m10
-  punpckhqdq m5, m10
-  punpcklqdq m3, m0, m7
-  punpckhqdq m0, m7
-  punpcklqdq m10, m9, m1
-  punpckhqdq m9, m1
-  punpcklqdq m7, m2, m6
-  punpckhqdq m2, m6
-
-  psraw m1, m4, 15
-  psraw m6, m5, 15
-  psraw m8, m3, 15
-  psraw m11, m0, 15
-
-  psubw m4, m1
-  psubw m5, m6
-  psubw m3, m8
-  psubw m0, m11
-
-  psraw m4, 1
-  psraw m5, 1
-  psraw m3, 1
-  psraw m0, 1
-
-  psraw m1, m10, 15
-  psraw m6, m9, 15
-  psraw m8, m7, 15
-  psraw m11, m2, 15
-
-  psubw m10, m1
-  psubw m9, m6
-  psubw m7, m8
-  psubw m2, m11
-
-  psraw m10, 1
-  psraw m9, 1
-  psraw m7, 1
-  psraw m2, 1
-
-  STORE_OUTPUT  0,  4
-  STORE_OUTPUT  8,  5
-  STORE_OUTPUT 16,  3
-  STORE_OUTPUT 24,  0
-  STORE_OUTPUT 32, 10
-  STORE_OUTPUT 40,  9
-  STORE_OUTPUT 48,  7
-  STORE_OUTPUT 56,  2
-
-  RET
-%endif
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index c5a652f..27cd1a0 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -645,7 +645,7 @@
       av1_init_inter_params(
           &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
           mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+          xd->bd, is_intrabc, sf, pre_buf, filters);
       av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                         &inter_pred_params);
       return;
diff --git a/aom_dsp/x86/intrapred_asm_sse2.asm b/aom_dsp/x86/intrapred_asm_sse2.asm
deleted file mode 100644
index a2570d3..0000000
--- a/aom_dsp/x86/intrapred_asm_sse2.asm
+++ /dev/null
@@ -1,608 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pb_1: times 16 db 1
-pw_4:  times 8 dw 4
-pw_8:  times 8 dw 8
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-dc_128: times 16 db 128
-pw2_4:  times 8 dw 2
-pw2_8:  times 8 dw 4
-pw2_16:  times 8 dw 8
-pw2_32:  times 8 dw 16
-
-SECTION .text
-
-INIT_XMM sse2
-cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  movd                  m2, [leftq]
-  movd                  m0, [aboveq]
-  pxor                  m1, m1
-  punpckldq             m0, m2
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw_4)]
-  psraw                 m0, 3
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
-  movifnidn          leftq, leftmp
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [leftq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_4)]
-  psraw                 m0, 2
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [aboveq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_4)]
-  psraw                 m0, 2
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_8)]
-  psraw                 m0, 4
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_8)]
-  psraw                 m0, 3
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
-  movifnidn          leftq, leftmp
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_8)]
-  psraw                 m0, 3
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movd     m0,        [GLOBAL(dc_128)]
-  movd    [dstq          ], m0
-  movd    [dstq+strideq  ], m0
-  movd    [dstq+strideq*2], m0
-  movd    [dstq+stride3q ], m0
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    m0,        [GLOBAL(dc_128)]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_16)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-
-INIT_XMM sse2
-cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_16)]
-  psraw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_16)]
-  psraw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  mova    m0,        [GLOBAL(dc_128)]
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-  RESTORE_GOT
-  RET
-
-
-INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [leftq]
-  mova                  m4, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  psadbw                m3, m1
-  psadbw                m4, m1
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_32)]
-  psraw                 m0, 6
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_32)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [leftq]
-  mova                  m2, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_32)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  mova    m0,        [GLOBAL(dc_128)]
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
-  movd                  m0, [aboveq]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 4
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 8
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m1
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m1
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m1
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m1
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  movd                  m0, [leftq]
-  punpcklbw             m0, m0
-  punpcklbw             m0, m0
-  pshufd                m1, m0, 0x1
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m1
-  pshufd                m2, m0, 0x2
-  lea                 dstq, [dstq+strideq*2]
-  pshufd                m3, m0, 0x3
-  movd      [dstq        ], m2
-  movd      [dstq+strideq], m3
-  RET
-
-INIT_XMM sse2
-cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  mov                lineq, -2
-  DEFINE_ARGS  dst, stride, line, left, stride3
-  lea             stride3q, [strideq*3]
-  movq                  m0, [leftq    ]
-  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
-.loop:
-  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
-  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
-  movq      [dstq        ], m1
-  movq      [dstq+strideq], m2
-  pshuflw               m1, m0, 0xaa
-  pshuflw               m2, m0, 0xff
-  movq    [dstq+strideq*2], m1
-  movq    [dstq+stride3q ], m2
-  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
-  inc                lineq
-  lea                 dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  mov                lineq, -4
-  DEFINE_ARGS dst, stride, line, left, stride3
-  lea             stride3q, [strideq*3]
-.loop:
-  movd                  m0, [leftq]
-  punpcklbw             m0, m0
-  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
-  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
-  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
-  mova    [dstq          ], m1
-  mova    [dstq+strideq  ], m2
-  pshufd            m1, m0, 0xaa
-  pshufd            m2, m0, 0xff
-  mova    [dstq+strideq*2], m1
-  mova    [dstq+stride3q ], m2
-  inc                lineq
-  lea                leftq, [leftq+4       ]
-  lea                 dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
-  movifnidn              leftq, leftmp
-  mov                    lineq, -8
-  DEFINE_ARGS dst, stride, line, left, stride3
-  lea                 stride3q, [strideq*3]
-.loop:
-  movd                      m0, [leftq]
-  punpcklbw                 m0, m0
-  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
-  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
-  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
-  mova     [dstq             ], m1
-  mova     [dstq+16          ], m1
-  mova     [dstq+strideq     ], m2
-  mova     [dstq+strideq+16  ], m2
-  pshufd                m1, m0, 0xaa
-  pshufd                m2, m0, 0xff
-  mova     [dstq+strideq*2   ], m1
-  mova     [dstq+strideq*2+16], m1
-  mova     [dstq+stride3q    ], m2
-  mova     [dstq+stride3q+16 ], m2
-  inc                    lineq
-  lea                    leftq, [leftq+4       ]
-  lea                     dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 87ed57c..adaf428 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -315,731 +315,6 @@
   }
 }
 
-void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_32(above);
-  __m256i sum_left = dc_sum_32(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm256_srai_epi16(sum_left, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_32(left);
-  (void)above;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-// There are 32 rows togeter. This function does line:
-// 0,1,2,3, and 16,17,18,19. The next call would do
-// 4,5,6,7, and 20,21,22,23. So 4 times of calling
-// would finish 32 rows.
-static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
-                                        ptrdiff_t stride) {
-  __m256i t[4];
-  __m256i m = _mm256_setzero_si256();
-  const __m256i inc = _mm256_set1_epi8(4);
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    t[i] = _mm256_shuffle_epi8(*row, m);
-    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
-    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
-    _mm256_storeu_si256((__m256i *)dst, r0);
-    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
-    dst += stride;
-    m = _mm256_add_epi8(m, inc);
-  }
-}
-
-void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
-
-  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
-
-  __m256i v = _mm256_unpacklo_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  v = _mm256_unpackhi_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  u = _mm256_unpackhi_epi8(left_col, left_col);
-
-  v = _mm256_unpacklo_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  v = _mm256_unpackhi_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// Rectangle
-void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i top_sum = dc_sum_32_sse2(above);
-  __m128i left_sum = dc_sum_16_sse2(left);
-  left_sum = _mm_add_epi16(top_sum, left_sum);
-  uint16_t sum = _mm_cvtsi128_si32(left_sum);
-  sum += 24;
-  sum /= 48;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_32(above);
-  __m256i sum_left = dc_sum_64(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 48;
-  sum /= 96;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = dc_sum_64(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 64;
-  sum /= 128;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = dc_sum_32(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 48;
-  sum /= 96;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 40;
-  sum /= 80;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i sum = dc_sum_16_sse2(left);
-  (void)above;
-
-  const __m128i eight = _mm_set1_epi16(8);
-  sum = _mm_add_epi16(sum, eight);
-  sum = _mm_srai_epi16(sum, 4);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i r = _mm_shuffle_epi8(sum, zero);
-  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_64(left);
-  (void)above;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_64(left);
-  (void)above;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_32(left);
-  (void)above;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i sum = dc_sum_16_sse2(left);
-  (void)above;
-
-  const __m128i eight = _mm_set1_epi16(8);
-  sum = _mm_add_epi16(sum, eight);
-  sum = _mm_srai_epi16(sum, 4);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i r = _mm_shuffle_epi8(sum, zero);
-  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 64, dst, stride);
-}
-
-void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 32, dst, stride);
-}
-
-void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 16 16-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
-                                 const __m256i *topleft) {
-  const __m256i base =
-      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
-
-  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
-  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
-  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
-
-  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
-  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
-  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
-
-  pl = _mm256_andnot_si256(mask1, *left);
-
-  ptl = _mm256_and_si256(mask2, *topleft);
-  pt = _mm256_andnot_si256(mask2, *top);
-  pt = _mm256_or_si256(pt, ptl);
-  pt = _mm256_and_si256(mask1, pt);
-
-  return _mm256_or_si256(pt, pl);
-}
-
-// Return 16 8-bit pixels in one row (__m128i)
-static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
-                                      const __m256i *topleft) {
-  const __m256i p0 = paeth_pred(left, top, topleft);
-  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i p = _mm256_packus_epi16(p0, p1);
-  return _mm256_castsi256_si128(p);
-}
-
-static INLINE __m256i get_top_vector(const uint8_t *above) {
-  const __m128i x = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
-  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
-}
-
-void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i x = _mm_loadl_epi64((const __m128i *)left);
-  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16((short)0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-static INLINE __m256i get_left_vector(const uint8_t *left) {
-  const __m128i x = _mm_load_si128((const __m128i *)left);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-}
-
-void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16((short)0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16((short)0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-
-  l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16((short)0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  for (int j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16((short)0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-      _mm_store_si128((__m128i *)dst, row);
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-// Return 32 8-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
-                                      const __m256i *top1,
-                                      const __m256i *topleft) {
-  __m256i p0 = paeth_pred(left, top0, topleft);
-  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i x0 = _mm256_packus_epi16(p0, p1);
-
-  p0 = paeth_pred(left, top1, topleft);
-  p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i x1 = _mm256_packus_epi16(p0, p1);
-
-  return _mm256_permute2x128_si256(x0, x1, 0x20);
-}
-
-void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i l = get_left_vector(left);
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16((short)0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
-
-    _mm256_storeu_si256((__m256i *)dst, r);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m256i l = get_left_vector(left);
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16((short)0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-
-  l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16((short)0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16((short)0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 2; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16((short)0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16((short)0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  const __m256i l = get_left_vector(left);
-  __m256i rep = _mm256_set1_epi16((short)0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-    _mm_store_si128((__m128i *)(dst + 32), r2);
-    _mm_store_si128((__m128i *)(dst + 48), r3);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
 #define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
 #define PERM2x128(c0, c1) c0 + (c1 << 4)
 
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
deleted file mode 100644
index 6902dd9..0000000
--- a/aom_dsp/x86/intrapred_sse2.c
+++ /dev/null
@@ -1,1412 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-#include "aom_dsp/x86/intrapred_x86.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
-                                ptrdiff_t stride) {
-  for (int i = 0; i < height; i += 2) {
-    *(uint32_t *)dst = dc;
-    dst += stride;
-    *(uint32_t *)dst = dc;
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
-                                ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_storel_epi64((__m128i *)dst, *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    _mm_store_si128((__m128i *)(dst + 16), *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    _mm_store_si128((__m128i *)(dst + 16), *row);
-    _mm_store_si128((__m128i *)(dst + 32), *row);
-    _mm_store_si128((__m128i *)(dst + 48), *row);
-    dst += stride;
-  }
-}
-
-static INLINE __m128i dc_sum_4(const uint8_t *ref) {
-  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_unpacklo_epi8(x, zero);
-  return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_8(const uint8_t *ref) {
-  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_64(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
-  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x2 = _mm_sad_epu8(x2, zero);
-  x3 = _mm_sad_epu8(x3, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  x2 = _mm_add_epi16(x2, x3);
-  x0 = _mm_add_epi16(x0, x2);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
-static INLINE int divide_using_multiply_shift(int num, int shift1,
-                                              int multiplier) {
-  const int interm = num >> shift1;
-  return interm * multiplier >> DC_SHIFT2;
-}
-
-// -----------------------------------------------------------------------------
-// DC_PRED
-
-void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_4(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 6;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16_sse2(left);
-  __m128i sum_above = dc_sum_4(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 10;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 6;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16_sse2(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 12;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32_sse2(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 20;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_16_sse2(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 10;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_16_sse2(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 12;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32_sse2(left);
-  __m128i sum_above = dc_sum_16_sse2(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 24;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_64(left);
-  __m128i sum_above = dc_sum_16_sse2(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 40;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sum_left = dc_sum_8(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 20;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sum_left = dc_sum_16_sse2(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 24;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sum_left = dc_sum_64(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 48;
-  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_64(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 64;
-  sum /= 128;
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_32_sse2(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 48;
-  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_16_sse2(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 40;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
-  sum_above = _mm_add_epi16(sum_above, two);
-  sum_above = _mm_srai_epi16(sum_above, 2);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  sum_above = _mm_packus_epi16(sum_above, sum_above);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
-  sum_above = _mm_add_epi16(sum_above, two);
-  sum_above = _mm_srai_epi16(sum_above, 2);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  sum_above = _mm_packus_epi16(sum_above, sum_above);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16_sse2(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32_sse2(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  sum_left = _mm_packus_epi16(sum_left, sum_left);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  sum_left = _mm_packus_epi16(sum_left, sum_left);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
-  sum_left = _mm_add_epi16(sum_left, two);
-  sum_left = _mm_srai_epi16(sum_left, 2);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32_sse2(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
-  sum_left = _mm_add_epi16(sum_left, two);
-  sum_left = _mm_srai_epi16(sum_left, 2);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32_sse2(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32_sse2(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16_sse2(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const uint32_t pred = 0x80808080;
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const uint32_t pred = 0x80808080;
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const uint32_t pred = *(uint32_t *)above;
-  (void)left;
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const uint32_t pred = *(uint32_t *)above;
-  (void)left;
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, int height) {
-  const __m128i row0 = _mm_load_si128((__m128i const *)above);
-  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, row0);
-    _mm_store_si128((__m128i *)(dst + 16), row1);
-    dst += stride;
-  }
-}
-
-void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 8);
-}
-
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 16);
-}
-
-void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 64);
-}
-
-static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, int height) {
-  const __m128i row0 = _mm_load_si128((__m128i const *)above);
-  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
-  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
-  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, row0);
-    _mm_store_si128((__m128i *)(dst + 16), row1);
-    _mm_store_si128((__m128i *)(dst + 32), row2);
-    _mm_store_si128((__m128i *)(dst + 48), row3);
-    dst += stride;
-  }
-}
-
-void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 64);
-}
-
-void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 32);
-}
-
-void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 16);
-}
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-  left_col = _mm_unpackhi_epi64(left_col, left_col);
-  row0 = _mm_shufflelo_epi16(left_col, 0);
-  row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_load_si128((__m128i const *)left);
-  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
-  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
-  row0 = _mm_shufflelo_epi16(left_col_low, 0);
-  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  row0 = _mm_shufflelo_epi16(left_col_high, 0);
-  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
-  row0 = _mm_shufflelo_epi16(left_col_high, 0);
-  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above, const uint8_t *left,
-                                      int count) {
-  (void)above;
-  for (int i = 0; i < count; ++i) {
-    const __m128i left_col = _mm_load_si128((__m128i const *)left);
-    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
-    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
-    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
-    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
-    row0 = _mm_shufflelo_epi16(left_col_low, 0);
-    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    row0 = _mm_shufflelo_epi16(left_col_high, 0);
-    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
-    row0 = _mm_shufflelo_epi16(left_col_high, 0);
-    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-    left += 16;
-  }
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  h_predictor_8x16xc(dst, stride, above, left, 1);
-}
-
-void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  h_predictor_8x16xc(dst, stride, above, left, 2);
-}
-
-static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
-    dst += stride;
-  }
-}
-
-static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
-  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
-  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
-  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
-  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
-
-  row[0] = _mm_unpacklo_epi64(u0, u0);
-  row[1] = _mm_unpacklo_epi64(u1, u1);
-  row[2] = _mm_unpacklo_epi64(u2, u2);
-  row[3] = _mm_unpacklo_epi64(u3, u3);
-}
-
-static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
-  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
-  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
-  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
-  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
-
-  row[0] = _mm_unpackhi_epi64(u0, u0);
-  row[1] = _mm_unpackhi_epi64(u1, u1);
-  row[2] = _mm_unpackhi_epi64(u2, u2);
-  row[3] = _mm_unpackhi_epi64(u3, u3);
-}
-
-// Process 16x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_low_4pixels(left, row);
-  h_pred_store_16xh(row, 4, dst, stride);
-}
-
-// Process 16x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_high_4pixels(left, row);
-  h_pred_store_16xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_16x8_1(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_16x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_16x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int count) {
-  int i = 0;
-  do {
-    const __m128i left_col = _mm_load_si128((const __m128i *)left);
-    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
-    dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
-    dst += stride << 2;
-
-    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
-    dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
-    dst += stride << 2;
-
-    left += 16;
-    i++;
-  } while (i < count);
-}
-
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_16xh(dst, stride, left, 2);
-}
-
-void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_16xh(dst, stride, left, 4);
-}
-
-static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
-    _mm_store_si128((__m128i *)(dst + 16), row[i]);
-    dst += stride;
-  }
-}
-
-// Process 32x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_low_4pixels(left, row);
-  h_pred_store_32xh(row, 4, dst, stride);
-}
-
-// Process 32x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_high_4pixels(left, row);
-  h_pred_store_32xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
-
-  left_col = _mm_load_si128((const __m128i *)left);
-
-  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
-
-  left_col = _mm_load_si128((const __m128i *)left);
-
-  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-  dst += stride << 2;
-
-  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int height) {
-  int i = height >> 2;
-  do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
-    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r0);
-    _mm_store_si128((__m128i *)(dst + stride), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
-    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
-    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
-    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
-    left += 4;
-    dst += stride * 4;
-  } while (--i);
-}
-
-void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_32xh(dst, stride, left, 64);
-}
-
-static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int height) {
-  int i = height >> 2;
-  do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
-    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r0);
-    _mm_store_si128((__m128i *)(dst + 32), r0);
-    _mm_store_si128((__m128i *)(dst + 48), r0);
-    _mm_store_si128((__m128i *)(dst + stride), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
-    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
-    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
-    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
-    left += 4;
-    dst += stride * 4;
-  } while (--i);
-}
-
-void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 64);
-}
-
-void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 32);
-}
-
-void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 16);
-}
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
deleted file mode 100644
index 468d036..0000000
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ /dev/null
@@ -1,1696 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/intrapred_common.h"
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 8 16-bit pixels in one row
-static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
-                                     const __m128i *topleft) {
-  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
-
-  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
-  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
-  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
-
-  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
-  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
-  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
-
-  pl = _mm_andnot_si128(mask1, *left);
-
-  ptl = _mm_and_si128(mask2, *topleft);
-  pt = _mm_andnot_si128(mask2, *top);
-  pt = _mm_or_si128(pt, ptl);
-  pt = _mm_and_si128(mask1, pt);
-
-  return _mm_or_si128(pl, pt);
-}
-
-void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int j = 0; j < 2; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16((short)0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m128i l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-// Return 16 8-bit pixels in one row
-static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
-                                      const __m128i *top1,
-                                      const __m128i *topleft) {
-  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
-  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
-  return _mm_packus_epi16(p0, p1);
-}
-
-void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-
-  l = _mm_load_si128((const __m128i *)(left + 16));
-  rep = _mm_set1_epi16((short)0x8000);
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16((short)0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m128i l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-      _mm_store_si128((__m128i *)dst, row);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  __m128i l16;
-
-  for (int i = 0; i < 8; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-
-  rep = _mm_set1_epi16((short)0x8000);
-  l = _mm_load_si128((const __m128i *)(left + 16));
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16((short)0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r32l);
-      _mm_store_si128((__m128i *)(dst + 16), r32h);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 2; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16((short)0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16((short)0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i;
-  const __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-    _mm_store_si128((__m128i *)(dst + 32), r2);
-    _mm_store_si128((__m128i *)(dst + 48), r3);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_PRED
-
-// pixels[0]: above and below_pred interleave vector
-// pixels[1]: left vector
-// pixels[2]: right_pred vector
-static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
-                                 int height, __m128i *pixels) {
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  if (height == 4)
-    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  else if (height == 8)
-    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
-
-  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// weight_w[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
-  weight_h[0] = _mm_unpacklo_epi8(t, zero);
-  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-
-  if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  }
-}
-
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
-                                   const __m128i *ww, int h, uint8_t *dst,
-                                   ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
-                            : _mm_set1_epi16((short)0x8000);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
-    b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, ww[0]);
-
-    sum = _mm_add_epi32(s, sum);
-    sum = _mm_add_epi32(sum, round);
-    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
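
smooth_pred_4xh combines the vertical and horizontal halves of the SMOOTH blend: pixel[0] interleaves the above row with the bottom-left sample, ww interleaves the width weights with their complements, and the two _mm_madd_epi16 sums are added, rounded and shifted by 1 + sm_weight_log2_scale. A scalar sketch of the per-pixel arithmetic, where w_h and w_w are shorthand for the sm_weight_arrays entries for the block height and width (illustrative names only):

#include <stdint.h>

// Scalar SMOOTH blend: scale is 1 << sm_weight_log2_scale (256), so the
// combined vertical + horizontal blend rounds and divides by 2 * scale.
static uint8_t smooth_pixel(const uint8_t *above, const uint8_t *left,
                            const uint8_t *w_h, const uint8_t *w_w,
                            int r, int c, int bw, int bh) {
  const uint32_t scale = 256;
  const uint32_t sum = w_h[r] * above[c] + (scale - w_h[r]) * left[bh - 1] +
                       w_w[c] * left[r] + (scale - w_w[c]) * above[bw - 1];
  return (uint8_t)((sum + scale) >> 9);  // round, then divide by 2 * scale
}
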
-
-void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 4, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 8, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 16, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-// pixels[2]: left vector
-// pixels[3]: right_pred vector
-// pixels[4]: above and below_pred interleave vector, first half
-// pixels[5]: above and below_pred interleave vector, second half
-// pixels[6]: left vector + 16
-// pixels[7]: right_pred vector
-static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
-                                 int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-  pixels[1] = _mm_unpackhi_epi16(d, bp);
-
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-
-  if (height == 4) {
-    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  } else if (height == 8) {
-    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
-  } else if (height == 16) {
-    pixels[2] = _mm_load_si128((const __m128i *)left);
-  } else {
-    pixels[2] = _mm_load_si128((const __m128i *)left);
-    pixels[4] = pixels[0];
-    pixels[5] = pixels[1];
-    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
-    pixels[7] = pixels[3];
-  }
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  const int we_offset = height < 8 ? 4 : 8;
-  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
-  weight_h[0] = _mm_unpacklo_epi8(we, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-
-  if (height == 4) {
-    we = _mm_srli_si128(we, 4);
-    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
-    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
-    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
-    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-  } else {
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-  }
-
-  if (height == 16) {
-    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(we, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(we, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  } else if (height == 32) {
-    const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
-    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-    const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
-                                   const __m128i *ww, int h, uint8_t *dst,
-                                   ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-
-  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
-                            : _mm_set1_epi16((short)0x8000);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
-    b = _mm_unpacklo_epi16(b, pixels[3]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 4, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 8, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 16, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[8];
-  load_pixel_w8(above, left, 32, pixels);
-
-  __m128i wh[8], ww[2];
-  load_weight_w8(sm_weight_arrays, 32, wh, ww);
-
-  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left, uint32_t bw,
-                                        uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
-  const __m128i dup16 = _mm_set1_epi32(0x01000100);
-  const __m128i top_right =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
-    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
-    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
-    const __m128i wl_y =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
-    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
-    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
-      const __m128i weights_x =
-          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
-      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
-      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
-      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
-
-      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
-      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
-      const __m128i scale_m_weights_x =
-          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
-      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
-      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
-      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
-
-      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
-      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
-
-      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
-      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
-
-      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
-      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_V_PRED
-
-// pixels[0]: above and below_pred interleave vector
-static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vector
-static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height == 4) {
-    const __m128i weight =
-        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-  } else if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-  } else {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-    weights[2] = _mm_unpackhi_epi8(weight, zero);
-    weights[3] = _mm_sub_epi16(d, weights[2]);
-  }
-}
-
-static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-    d = _mm_add_epi16(d, inc);
-  }
-}
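
SMOOTH_V drops the horizontal half of the blend: only the height weights are applied, against the bottom-left sample, and the final shift is sm_weight_log2_scale rather than sm_weight_log2_scale + 1. Scalar sketch, with the same illustrative w_h naming as above:

#include <stdint.h>

// Scalar SMOOTH_V: blend each above pixel with the bottom-left sample.
static uint8_t smooth_v_pixel(const uint8_t *above, const uint8_t *left,
                              const uint8_t *w_h, int r, int c, int bh) {
  const uint32_t sum = w_h[r] * above[c] + (256 - w_h[r]) * left[bh - 1];
  return (uint8_t)((sum + 128) >> 8);  // round, then divide by scale
}
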
-
-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 4, &pixels);
-
-  __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 4, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 8, &pixels);
-
-  __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 8, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 16, &pixels);
-
-  __m128i weights[4];
-  load_weight_v_w4(sm_weight_arrays, 16, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-  pixels[1] = _mm_unpackhi_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_h) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height < 16) {
-    const int offset = height < 8 ? 4 : 8;
-    const __m128i weight =
-        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  } else {
-    const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
-    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-    const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
-                                     int h, uint8_t *dst, ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
-    s0 = _mm_add_epi32(s0, pred_round);
-    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, pred_round);
-    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
-
-    __m128i sum01 = _mm_packus_epi16(s0, s1);
-    sum01 = _mm_shuffle_epi8(sum01, gat);
-    _mm_storel_epi64((__m128i *)dst, sum01);
-    dst += stride;
-
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 4, pixels);
-
-  __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 4, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 8, pixels);
-
-  __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 8, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 16, pixels);
-
-  __m128i wh[4];
-  load_weight_v_w8(sm_weight_arrays, 16, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 32, pixels);
-
-  __m128i wh[8];
-  load_weight_v_w8(sm_weight_arrays, 32, wh);
-
-  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
-}
-
-static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                          const uint8_t *above,
-                                          const uint8_t *left, uint32_t bw,
-                                          uint32_t bh) {
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i dup16 = _mm_set1_epi32(0x01000100);
-  const __m128i bottom_left =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round =
-      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i scale_m_weights_y =
-        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
-    const __m128i wl_y =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
-      // 8 -> 16
-      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
-      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
-      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
-      // top_x * weights_y + scale_m_weights_y * bottom_left
-      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
-      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
-      pred_lo = _mm_add_epi32(pred_lo, round);
-      pred_hi = _mm_add_epi32(pred_hi, round);
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_H_PRED
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  if (height == 4)
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  else if (height == 8)
-    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
-  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
-}
-
-// weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
-  (void)height;
-  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
-  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
-}
-
-static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16((short)0x8000);
-
-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
-    b = _mm_unpacklo_epi16(b, pixel[1]);
-    __m128i sum = _mm_madd_epi16(b, weight[0]);
-
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
-}
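
SMOOTH_H is the mirror image: only the width weights are used, blended against the top-right sample. Scalar sketch, again with illustrative w_w naming:

#include <stdint.h>

// Scalar SMOOTH_H: blend each left pixel with the top-right sample.
static uint8_t smooth_h_pixel(const uint8_t *above, const uint8_t *left,
                              const uint8_t *w_w, int r, int c, int bw) {
  const uint32_t sum = w_w[c] * left[r] + (256 - w_w[c]) * above[bw - 1];
  return (uint8_t)((sum + 128) >> 8);
}
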
-
-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 4, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 4, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 8, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 16, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-  dst += stride << 3;
-
-  pixels[0] = _mm_srli_si128(pixels[0], 8);
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-// pixels[2]: left vector + 16
-// pixels[3]: right_pred vector
-static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
-
-  if (height == 4) {
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  } else if (height == 8) {
-    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
-  } else if (height == 16) {
-    pixels[0] = _mm_load_si128((const __m128i *)left);
-  } else {
-    pixels[0] = _mm_load_si128((const __m128i *)left);
-    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
-    pixels[3] = pixels[1];
-  }
-}
-
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_w) {
-  (void)height;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
-  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
-  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
-  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-}
-
-static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
-                                     int h, uint8_t *dst, ptrdiff_t stride,
-                                     int second_half) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
-                            : _mm_set1_epi16((short)0x8000);
-
-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
-    b = _mm_unpacklo_epi16(b, pixels[1]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
-    sum0 = _mm_add_epi32(sum0, pred_round);
-    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
-
-    sum1 = _mm_add_epi32(sum1, pred_round);
-    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(sum0, sum1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 4, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 4, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 8, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 8, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 16, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 16, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_h_w8(above, left, 32, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 32, ww);
-
-  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                          const uint8_t *above,
-                                          const uint8_t *left, uint32_t bw,
-                                          uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
-    const __m128i tr_ly =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i weights_x =
-          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
-      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
-      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
-      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
-      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
-      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
-      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
-
-      pred_lo = _mm_add_epi32(pred_lo, pred_round);
-      pred_hi = _mm_add_epi32(pred_hi, pred_round);
-
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
-}
diff --git a/aom_dsp/x86/jnt_sad_ssse3.c b/aom_dsp/x86/jnt_sad_ssse3.c
index 6d7b3f6..7ef5eb5 100644
--- a/aom_dsp/x86/jnt_sad_ssse3.c
+++ b/aom_dsp/x86/jnt_sad_ssse3.c
@@ -192,48 +192,4 @@
 
   return res;
 }
-
-#define dist_wtd_sadMxN_sse2(m, n)                                            \
-  unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3(                         \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
-                               jcp_param);                                    \
-    return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
-  }
-
-#define dist_wtd_sadMxN_avx2(m, n)                                            \
-  unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2(                          \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
-                               jcp_param);                                    \
-    return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
-  }
-
-/* clang-format off */
-dist_wtd_sadMxN_sse2(128, 128)
-dist_wtd_sadMxN_sse2(128, 64)
-dist_wtd_sadMxN_sse2(64, 128)
-dist_wtd_sadMxN_sse2(64, 64)
-dist_wtd_sadMxN_sse2(64, 32)
-dist_wtd_sadMxN_sse2(32, 64)
-dist_wtd_sadMxN_sse2(32, 32)
-dist_wtd_sadMxN_sse2(32, 16)
-dist_wtd_sadMxN_sse2(16, 32)
-dist_wtd_sadMxN_sse2(16, 16)
-dist_wtd_sadMxN_sse2(16, 8)
-dist_wtd_sadMxN_sse2(8, 16)
-dist_wtd_sadMxN_sse2(8, 8)
-dist_wtd_sadMxN_sse2(8, 4)
-dist_wtd_sadMxN_sse2(4, 8)
-dist_wtd_sadMxN_sse2(4, 4)
-dist_wtd_sadMxN_sse2(4, 16)
-dist_wtd_sadMxN_sse2(16, 4)
-dist_wtd_sadMxN_sse2(8, 32)
-dist_wtd_sadMxN_sse2(32, 8)
-dist_wtd_sadMxN_sse2(16, 64)
-dist_wtd_sadMxN_sse2(64, 16)
-    /* clang-format on */
+/* clang-format on */
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
deleted file mode 100644
index fa7e145..0000000
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
-                                        const __m128i *w, const __m128i *r,
-                                        void *const result) {
-  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
-  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
-  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
-  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
-
-  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
-  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
-  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
-  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
-
-  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
-}
-
-void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
-                                      int width, int height, const uint8_t *ref,
-                                      int ref_stride,
-                                      const DIST_WTD_COMP_PARAMS *jcp_param) {
-  int i;
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  if (width >= 16) {
-    // Read 16 pixels one row at a time
-    assert(!(width & 15));
-    for (i = 0; i < height; ++i) {
-      int j;
-      for (j = 0; j < width; j += 16) {
-        __m128i p0 = xx_loadu_128(ref);
-        __m128i p1 = xx_loadu_128(pred);
-
-        compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
-
-        comp_pred += 16;
-        pred += 16;
-        ref += 16;
-      }
-      ref += ref_stride - width;
-    }
-  } else if (width >= 8) {
-    // Read 8 pixels two row at a time
-    assert(!(width & 7));
-    assert(!(width & 1));
-    for (i = 0; i < height; i += 2) {
-      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
-      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
-      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
-      __m128i p1 = xx_loadu_128(pred);
-
-      compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
-
-      comp_pred += 16;
-      pred += 16;
-      ref += 2 * ref_stride;
-    }
-  } else {
-    // Read 4 pixels four row at a time
-    assert(!(width & 3));
-    assert(!(height & 3));
-    for (i = 0; i < height; i += 4) {
-      const uint8_t *row0 = ref + 0 * ref_stride;
-      const uint8_t *row1 = ref + 1 * ref_stride;
-      const uint8_t *row2 = ref + 2 * ref_stride;
-      const uint8_t *row3 = ref + 3 * ref_stride;
-
-      __m128i p0 =
-          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
-                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
-                        row3[0], row3[1], row3[2], row3[3]);
-      __m128i p1 = xx_loadu_128(pred);
-
-      compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
-
-      comp_pred += 16;
-      pred += 16;
-      ref += 4 * ref_stride;
-    }
-  }
-}
-
-void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred);
-    __m128i p1 = xx_loadu_128(pred);
-
-    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
-
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-#define DIST_WTD_SUBPIX_AVG_VAR(W, H)                                      \
-  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3(           \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,            \
-      const uint8_t *b, int b_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
-    uint16_t fdata3[(H + 1) * W];                                          \
-    uint8_t temp2[H * W];                                                  \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                            \
-                                                                           \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                           \
-        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                          \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);          \
-                                                                           \
-    aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,   \
-                                     jcp_param);                           \
-                                                                           \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);              \
-  }
-
-DIST_WTD_SUBPIX_AVG_VAR(128, 128)
-DIST_WTD_SUBPIX_AVG_VAR(128, 64)
-DIST_WTD_SUBPIX_AVG_VAR(64, 128)
-DIST_WTD_SUBPIX_AVG_VAR(64, 64)
-DIST_WTD_SUBPIX_AVG_VAR(64, 32)
-DIST_WTD_SUBPIX_AVG_VAR(32, 64)
-DIST_WTD_SUBPIX_AVG_VAR(32, 32)
-DIST_WTD_SUBPIX_AVG_VAR(32, 16)
-DIST_WTD_SUBPIX_AVG_VAR(16, 32)
-DIST_WTD_SUBPIX_AVG_VAR(16, 16)
-DIST_WTD_SUBPIX_AVG_VAR(16, 8)
-DIST_WTD_SUBPIX_AVG_VAR(8, 16)
-DIST_WTD_SUBPIX_AVG_VAR(8, 8)
-DIST_WTD_SUBPIX_AVG_VAR(8, 4)
-DIST_WTD_SUBPIX_AVG_VAR(4, 8)
-DIST_WTD_SUBPIX_AVG_VAR(4, 4)
-DIST_WTD_SUBPIX_AVG_VAR(4, 16)
-DIST_WTD_SUBPIX_AVG_VAR(16, 4)
-DIST_WTD_SUBPIX_AVG_VAR(8, 32)
-DIST_WTD_SUBPIX_AVG_VAR(32, 8)
-DIST_WTD_SUBPIX_AVG_VAR(16, 64)
-DIST_WTD_SUBPIX_AVG_VAR(64, 16)
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
deleted file mode 100644
index 6cd4033..0000000
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ /dev/null
@@ -1,2103 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-
-#if !CONFIG_NEW_DF
-static INLINE __m128i abs_diff(__m128i a, __m128i b) {
-  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them to 4x8  independently while flipping the second matrix horizontally.
-// Used for 14 taps pq pairs creation
-static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                        __m128i *x3, __m128i *q0p0,
-                                        __m128i *q1p1, __m128i *q2p2,
-                                        __m128i *q3p3, __m128i *q4p4,
-                                        __m128i *q5p5, __m128i *q6p6,
-                                        __m128i *q7p7) {
-  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpackhi_epi8(
-      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
-  w3 = _mm_unpackhi_epi8(
-      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
-
-  ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
-  ww2 = _mm_unpacklo_epi16(
-      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
-  ww3 = _mm_unpackhi_epi16(
-      w2,
-      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
-
-  *q7p7 = _mm_unpacklo_epi32(
-      ww0,
-      _mm_srli_si128(
-          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
-  *q6p6 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww0, 4),
-      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
-  *q5p5 = _mm_unpackhi_epi32(
-      ww0,
-      _mm_slli_si128(
-          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
-  *q4p4 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww0, 12),
-      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
-  *q3p3 = _mm_unpacklo_epi32(
-      ww1,
-      _mm_srli_si128(
-          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
-  *q2p2 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww1, 4),
-      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
-  *q1p1 = _mm_unpackhi_epi32(
-      ww1,
-      _mm_slli_si128(
-          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
-  *q0p0 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww1, 12),
-      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them  independently while flipping the second matrix horizontaly  Used for 14
-// taps filter pq pairs inverse
-static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
-                                            __m128i *x2, __m128i *x3,
-                                            __m128i *x4, __m128i *x5,
-                                            __m128i *x6, __m128i *x7,
-                                            __m128i *pq0, __m128i *pq1,
-                                            __m128i *pq2, __m128i *pq3) {
-  __m128i w10, w11, w12, w13;
-  __m128i w0, w1, w2, w3, w4, w5;
-  __m128i d0, d1, d2, d3;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  d0 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  d2 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  w10 = _mm_unpacklo_epi8(
-      *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
-  w11 = _mm_unpacklo_epi8(
-      *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
-  w12 = _mm_unpacklo_epi8(
-      *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
-  w13 = _mm_unpacklo_epi8(
-      *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
-
-  w4 = _mm_unpackhi_epi16(
-      w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpackhi_epi16(
-      w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  d1 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  d3 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
-  *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
-  *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
-  *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
-}
-
-static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
-                                          __m128i *hev, __m128i *mask,
-                                          __m128i *qs1qs0, __m128i *ps1ps0) {
-  __m128i filter, filter2filter1, work;
-  __m128i ps1ps0_work, qs1qs0_work;
-  __m128i hev1;
-  const __m128i t3t4 =
-      _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8((char)0x80);
-  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
-  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
-  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
-  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
-  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
-  filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
-  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
-  filter = _mm_and_si128(filter, *mask); /* & mask */
-  filter = _mm_unpacklo_epi32(filter, filter);
-
-  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
-  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
-  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
-  filter2filter1 =
-      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // goto 16 bit
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
-  filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
-
-  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
-  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
-  filter = _mm_unpacklo_epi8(filter, filter);  // goto 16 bit
-  filter = _mm_srai_epi16(filter, 9);          /* round */
-  filter = _mm_packs_epi16(filter, filter);
-  filter = _mm_andnot_si128(*hev, filter);
-  filter = _mm_unpacklo_epi32(filter, filter);
-
-  filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
-  hev1 = _mm_srli_si128(filter2filter1, 8);
-  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
-  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
-  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
-  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-
-  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
-  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
-                                               __m128i *hev, __m128i *mask,
-                                               __m128i *qs1qs0,
-                                               __m128i *ps1ps0) {
-  const __m128i t3t4 =
-      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8((char)0x80);
-  __m128i filter, filter2filter1, work;
-  __m128i ps1ps0_work, qs1qs0_work;
-  __m128i hev1;
-  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
-  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
-  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
-  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
-  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
-  filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
-  filter = _mm_and_si128(filter, *mask); /* & mask */
-  filter = _mm_unpacklo_epi64(filter, filter);
-
-  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
-  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
-  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
-  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
-  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
-  filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
-  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
-
-  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
-  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
-  filter = _mm_unpacklo_epi8(filter, filter);
-  filter = _mm_srai_epi16(filter, 9); /* round */
-  filter = _mm_packs_epi16(filter, filter);
-  filter = _mm_andnot_si128(*hev, filter);
-
-  hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
-  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
-
-  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
-  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
-  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
-  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
-  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
-    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
-  __m128i q1p1, q0p0, p1p0, q1q0;
-  __m128i abs_p0q0, abs_p1q1;
-  __m128i mask, flat, hev;
-  const __m128i zero = _mm_setzero_si128();
-
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  /* (abs(q1 - q0), abs(p1 - p0) */
-  flat = abs_diff(q1p1, q0p0);
-  /* abs(p1 - q1), abs(p0 - q0) */
-  __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
-  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  hev = _mm_unpacklo_epi8(flat, zero);
-
-  hev = _mm_cmpgt_epi16(hev, *thresh);
-  hev = _mm_packs_epi16(hev, hev);
-  hev = _mm_unpacklo_epi32(hev, hev);
-
-  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
-  abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
-  abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
-  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
-  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-
-  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
-  mask = _mm_unpacklo_epi32(mask, flat);
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
-
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
-    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
-  __m128i q1p1, q0p0, p1p0, q1q0;
-  __m128i abs_p0q0, abs_p1q1;
-  __m128i mask, hev;
-  const __m128i zero = _mm_setzero_si128();
-
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  /* (abs(q1 - q0), abs(p1 - p0) */
-  __m128i flat = abs_diff(q1p1, q0p0);
-  /* abs(p1 - q1), abs(p0 - q0) */
-  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
-  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-  hev = _mm_unpacklo_epi8(flat, zero);
-
-  hev = _mm_cmpgt_epi16(hev, *thresh);
-  hev = _mm_packs_epi16(hev, hev);
-
-  /* const int8_t mask = filter_mask2(*limit, *blimit, */
-  /*                                  p1, p0, q0, q1); */
-  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
-  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
-  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
-  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
-  mask = _mm_unpacklo_epi64(mask, flat);
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
-
-  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
-                               const uint8_t *_blimit, const uint8_t *_limit,
-                               const uint8_t *_thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
-                                     _mm_loadl_epi64((const __m128i *)_limit));
-  __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
-  __m128i qs1qs0, ps1ps0;
-  __m128i p1, p0, q0, q1;
-
-  p1 = xx_loadl_32(s - 2 * p);
-  p0 = xx_loadl_32(s - 1 * p);
-  q0 = xx_loadl_32(s - 0 * p);
-  q1 = xx_loadl_32(s + 1 * p);
-
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
-
-  xx_storel_32(s - 1 * p, ps1ps0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
-  xx_storel_32(s + 0 * p, qs1qs0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
-}
-
-void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
-                             const uint8_t *_blimit, const uint8_t *_limit,
-                             const uint8_t *_thresh) {
-  __m128i p1p0, q1q0;
-  __m128i p1, p0, q0, q1;
-
-  const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
-                                     _mm_loadl_epi64((const __m128i *)_limit));
-  __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
-  __m128i x0, x1, x2, x3;
-  __m128i d0, d1, d2, d3;
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
-  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
-
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
-
-  // Transpose 8x4 to 4x8
-  p1 = _mm_srli_si128(p1p0, 4);
-  q1 = _mm_srli_si128(q1q0, 4);
-
-  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
-  xx_storel_32(s + 0 * p - 2, d0);
-  xx_storel_32(s + 1 * p - 2, d1);
-  xx_storel_32(s + 2 * p - 2, d2);
-  xx_storel_32(s + 3 * p - 2, d3);
-}
-
-static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
-  xx_storel_32(s - (num + 1) * p, x);
-  xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
-    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
-    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i mask, hev, flat, flat2;
-  __m128i qs0ps0, qs1ps1;
-  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
-  __m128i abs_p1p0;
-
-  p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
-  q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
-
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
-    __m128i fe, ff, work;
-    abs_p1p0 = abs_diff(*q1p1, *q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8((char)0xfe);
-    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
-  qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
-  qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
-  // loopfilter done
-
-  __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-  __m128i work;
-  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-    __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-    __m128i pixelFilter_p, pixelFilter_q;
-    __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-    __m128i sum_p6, sum_q6;
-    __m128i sum_p3, sum_q3, res_p, res_q;
-
-    p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
-    p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
-    p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
-    p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
-    p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
-    q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
-    q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
-    q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
-    q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
-    q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
-    q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
-    q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
-    pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
-    pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
-
-    pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-    pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-    pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-    pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-    pixelFilter_p =
-        _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-    pixetFilter_p2p1p0 = _mm_add_epi16(
-        four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixelFilter_p,
-                      _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
-                                    _mm_add_epi16(p1_16, q0_16))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixelFilter_p,
-                      _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
-                                    _mm_add_epi16(p0_16, q1_16))),
-        4);
-    flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-    flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-    sum_p6 = _mm_add_epi16(p6_16, p6_16);
-    sum_q6 = _mm_add_epi16(q6_16, q6_16);
-    sum_p3 = _mm_add_epi16(p3_16, p3_16);
-    sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
-        4);
-    flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-    flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-    flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-    // work with flat2
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-
-    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    flat = _mm_unpacklo_epi64(flat, flat);
-    *q2p2 = _mm_andnot_si128(flat, *q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
-          4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
-          4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
-          4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
-          4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      // wide flat
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-      flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
-      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-      flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-      *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
-
-      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-      flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-      *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
-
-      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-      flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-      *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
-
-      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-      flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-      *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
-
-      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-      flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-      *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
-
-      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-      flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-      *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
-    }
-  } else {
-    *q0p0 = qs0ps0;
-    *q1p1 = qs1ps1;
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
-    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
-    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i mask, hev, flat, flat2;
-  __m128i flat2_pq[6], flat_pq[3];
-  __m128i qs0ps0, qs1ps1;
-  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
-  __m128i abs_p1p0;
-
-  p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  __m128i fe, ff, work;
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
-    abs_p1p0 = abs_diff(*q1p1, *q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-    fe = _mm_set1_epi8((char)0xfe);
-    ff = _mm_cmpeq_epi8(fe, fe);
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi32(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_unpacklo_epi32(mask, zero);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
-  qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
-  qs1ps1 = _mm_srli_si128(qs0ps0, 8);
-  // loopfilter done
-
-  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  flat = _mm_unpacklo_epi32(flat, flat);
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-    __m128i pq_16[7];
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p6;
-    __m128i sum_p3;
-
-    pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
-    pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
-    pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
-    pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
-    pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
-    pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
-    pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
-    q0_16 = _mm_srli_si128(pq_16[0], 8);
-    q1_16 = _mm_srli_si128(pq_16[1], 8);
-    q2_16 = _mm_srli_si128(pq_16[2], 8);
-    q3_16 = _mm_srli_si128(pq_16[3], 8);
-    q4_16 = _mm_srli_si128(pq_16[4], 8);
-    q5_16 = _mm_srli_si128(pq_16[5], 8);
-
-    __m128i flat_p[3], flat_q[3];
-    __m128i flat2_p[6], flat2_q[6];
-
-    __m128i work0, work0_0, work0_1, sum_p_0;
-    __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
-    __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
-
-    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
-    __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
-    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
-
-    sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
-    sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
-    sum_p = _mm_sub_epi16(sum_p_0, q5_16);
-
-    work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
-    work0_1 = _mm_add_epi16(
-        sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
-
-    sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q2_16);
-
-    work0 = _mm_add_epi16(sum_p3, pq_16[1]);
-    flat_p[1] = _mm_add_epi16(sum_lp, work0);
-    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
-    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
-    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-    flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
-    flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q1_16);
-    sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
-
-    sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
-    work0 = _mm_add_epi16(sum_p3, pq_16[2]);
-
-    flat_p[2] = _mm_add_epi16(sum_lp, work0);
-    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-    flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    flat2 = _mm_unpacklo_epi32(flat2, flat2);
-
-    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
-    *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
-    *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
-
-    *q2p2 = _mm_andnot_si128(flat, *q2p2);
-    flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
-    *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
-
-    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
-      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
-      flat2_q[0] = _mm_add_epi16(
-          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
-
-      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
-      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
-      flat2_pq[0] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-      flat2_pq[1] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-      flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
-      flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
-
-      sum_p = _mm_sub_epi16(sum_p, q4_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
-      flat2_p[2] = _mm_add_epi16(sum_p, work0);
-      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[2] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-      flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q3_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
-      flat2_p[3] = _mm_add_epi16(sum_p, work0);
-      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[3] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-      flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q2_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
-      flat2_p[4] = _mm_add_epi16(sum_p, work0);
-      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[4] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-      flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q1_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
-      flat2_p[5] = _mm_add_epi16(sum_p, work0);
-      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[5] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-      flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
-
-      // wide flat
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-      flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
-      *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
-
-      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-      flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
-      *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
-
-      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-      flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
-      *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
-
-      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-      flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
-      *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
-
-      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-      flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
-      *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
-
-      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-      flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
-      *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
-    }
-  } else {
-    *q0p0 = qs0ps0;
-    *q1p1 = qs1ps1;
-  }
-}
-
-void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
-  q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
-  q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
-  q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
-
-  q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
-
-  q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
-
-  q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
-
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
-
-  store_buffer_horz_8(q0p0, p, 0, s);
-  store_buffer_horz_8(q1p1, p, 1, s);
-  store_buffer_horz_8(q2p2, p, 2, s);
-  store_buffer_horz_8(q3p3, p, 3, s);
-  store_buffer_horz_8(q4p4, p, 4, s);
-  store_buffer_horz_8(q5p5, p, 5, s);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
-    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
-    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
-  __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
-  __m128i ps1ps0, qs1qs0;
-
-  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8((char)0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-
-  {
-    // filter_mask and hev_mask
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
-    abs_p0q0 = abs_diff(*p1p0, *q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-    // considering sse doesn't have unsigned elements comparison the idea is
-    // to find at least one case when X > limit, it means the corresponding
-    // mask bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = abs_diff(q2p2, q1p1);
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
-    // flat_mask
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-
-    // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
-                            _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
-                                 3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_shft1 = _mm_srli_epi16(workp_a, 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
-                            p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_add_epi16(q1_16, q2_16);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
-                            p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(q2_16, q2_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
-                                 3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
-    *q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
-    *p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
-    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
-    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
-  __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
-  __m128i ps1ps0, qs1qs0;
-
-  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
-  *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8((char)0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-  {
-    // filter_mask and hev_mask
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
-    abs_p0q0 = abs_diff(*p1p0, *q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-    // considering sse doesn't have unsigned elements comparison the idea is
-    // to find at least one case when X > limit, it means the corresponding
-    // mask bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi32(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_unpacklo_epi32(mask, zero);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = abs_diff(q2p2, q1p1);
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
-    // flat_mask
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi32(flat, flat);
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_c;
-    __m128i pq0x2_pq1, pq1_pq2;
-    pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
-    pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
-    pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
-    q0_16 = _mm_srli_si128(pq0_16, 8);
-    q2_16 = _mm_srli_si128(pq2_16, 8);
-
-    // op1
-    pq0x2_pq1 =
-        _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
-    pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
-                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
-    workp_b =
-        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
-    workp_b = _mm_srli_epi16(workp_b, 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
-                            pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_srli_si128(pq1_pq2, 8);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
-                            pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(q2_16, q2_16);
-    workp_b =
-        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
-    workp_a = _mm_srli_epi16(workp_a, 3);
-
-    flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
-    *q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
-    *p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
-  }
-}
-
-void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
-  __m128i p2, p1, p0, q0, q1, q2;
-  __m128i p1p0, q1q0;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  p2 = xx_loadl_32(s - 3 * p);
-  p1 = xx_loadl_32(s - 2 * p);
-  p0 = xx_loadl_32(s - 1 * p);
-  q0 = xx_loadl_32(s - 0 * p);
-  q1 = xx_loadl_32(s + 1 * p);
-  q2 = xx_loadl_32(s + 2 * p);
-
-  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
-
-  xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
-  xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-}
-
-void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i p2, p1, p0, q0, q1, q2;
-  __m128i p1p0, q1q0;
-
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
-  lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                           &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_8_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *blimit, __m128i *limit, __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
-      flat_p1p0, flat_q0q1;
-  __m128i q2p2, q1p1, q0p0;
-  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_pq, opq2, pq2;
-
-  q3p3 = _mm_unpacklo_epi32(*p3, *q3);
-  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  // filter_mask and hev_mask
-
-  // considering sse doesn't have unsigned elements comparison the idea is to
-  // find at least one case when X > limit, it means the corresponding  mask
-  // bit is set.
-  // to achieve that we find global max value of all inputs of abs(x-y) or
-  // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-  // otherwise - not
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8((char)0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
-  abs_p1p0 = abs_diff(q1p1, q0p0);
-  abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
-  abs_p0q0 = abs_diff(p1p0, q1q0);
-  abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu8(flat, *thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-  // replicate for the further "merged variables" usage
-  hev = _mm_unpacklo_epi32(hev, hev);
-
-  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-  mask = _mm_unpacklo_epi32(mask, zero);
-  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = _mm_max_epu8(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
-  mask = _mm_max_epu8(work, mask);
-  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
-  // flat_mask4
-  flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi32(flat, flat);
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-    p3_16 = _mm_unpacklo_epi8(*p3, zero);
-    q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
-    // op2
-    workp_a =
-        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft2 = _mm_add_epi16(workp_a, workp_b);
-
-    // op1
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
-    workp_c = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
-    workp_d = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-    flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
-    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-    workp_c = _mm_add_epi16(workp_a, workp_b);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
-    workp_d = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-    flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-
-    opq2 = _mm_packus_epi16(workp_c, workp_c);
-
-    work_pq = _mm_andnot_si128(flat, q2p2);
-    pq2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_pq, pq2);
-    *q2 = _mm_srli_si128(*p2, 4);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *blimit, __m128i *limit, __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
-      flat_p1p0, flat_q0q1;
-  __m128i q2p2, q1p1, q0p0;
-  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_pq, opq2, pq2;
-
-  q3p3 = _mm_unpacklo_epi64(*p3, *q3);
-  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  {
-    // filter_mask and hev_mask
-
-    // considering sse doesn't have unsigned elements comparison the idea is to
-    // find at least one case when X > limit, it means the corresponding  mask
-    // bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8((char)0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
-    // flat_mask4
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-    flat = _mm_max_epu8(abs_p1p0, flat);
-
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-    p3_16 = _mm_unpacklo_epi8(*p3, zero);
-    q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
-    // op2
-    workp_a =
-        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op1
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
-
-    work_pq = _mm_andnot_si128(flat, q2p2);
-    pq2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_pq, pq2);
-    *q2 = _mm_srli_si128(*p2, 8);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-  __m128i q1q0, p1p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  p3 = xx_loadl_32(s - 4 * p);
-  p2 = xx_loadl_32(s - 3 * p);
-  p1 = xx_loadl_32(s - 2 * p);
-  p0 = xx_loadl_32(s - 1 * p);
-  q0 = xx_loadl_32(s - 0 * p);
-  q1 = xx_loadl_32(s + 1 * p);
-  q2 = xx_loadl_32(s + 2 * p);
-  q3 = xx_loadl_32(s + 3 * p);
-
-  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                      &blimit, &limit, &thresh);
-
-  xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
-  xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-  xx_storel_32(s - 3 * p, p2);
-  xx_storel_32(s + 2 * p, q2);
-}
-
-void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit0,
-                                     const unsigned char *_limit0,
-                                     const unsigned char *_thresh0,
-                                     const unsigned char *_blimit1,
-                                     const unsigned char *_limit1,
-                                     const unsigned char *_thresh1) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                                     _mm_load_si128((const __m128i *)_limit1));
-  __m128i thresh =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-
-  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-
-  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
-
-  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
-
-  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                            &blimit, &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-  _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
-  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-  _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
-  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-  _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
-}
-
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-
-  lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                           &blimit, &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
-  __m128i p1, p0, q0, q1;
-  __m128i qs1qs0, ps1ps0;
-
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-
-  __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
-  __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
-  __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
-  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
-  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
-}
-
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i p0, q0, q1, p1;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i qs1qs0, ps1ps0;
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-
-  __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
-  __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
-  __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
-  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
-  x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
-
-  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
-                        &q1);
-
-  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
-  p1 = _mm_srli_si128(ps1ps0, 8);
-  q1 = _mm_srli_si128(qs1qs0, 8);
-
-  transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
-                        &d5, &d6, &d7);
-
-  xx_storel_32((s - 2 + 0 * p), d0);
-  xx_storel_32((s - 2 + 1 * p), d1);
-  xx_storel_32((s - 2 + 2 * p), d2);
-  xx_storel_32((s - 2 + 3 * p), d3);
-  xx_storel_32((s - 2 + 4 * p), d4);
-  xx_storel_32((s - 2 + 5 * p), d5);
-  xx_storel_32((s - 2 + 6 * p), d6);
-  xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
-                             const unsigned char *_blimit,
-                             const unsigned char *_limit,
-                             const unsigned char *_thresh) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x2, x1, x0, x3;
-  __m128i p0, q0;
-  __m128i p1p0, q1q0;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
-  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
-                        &d7);
-
-  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 4);
-  q0 = _mm_srli_si128(q1q0, 4);
-
-  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
-  xx_storel_32(s + 0 * p - 2, d0);
-  xx_storel_32(s + 1 * p - 2, d1);
-  xx_storel_32(s + 2 * p - 2, d2);
-  xx_storel_32(s + 3 * p - 2, d3);
-}
-
-void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i p0, q0;
-  __m128i p1p0, q1q0;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-
-  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
-
-  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  d1 = _mm_srli_si128(d0d1, 8);
-  d3 = _mm_srli_si128(d2d3, 8);
-  d5 = _mm_srli_si128(d4d5, 8);
-  d7 = _mm_srli_si128(d6d7, 8);
-
-  lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
-                           &blimit, &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
-                        &d6, &d7);
-
-  xx_storel_32((s - 2 + 0 * p), d0);
-  xx_storel_32((s - 2 + 1 * p), d1);
-  xx_storel_32((s - 2 + 2 * p), d2);
-  xx_storel_32((s - 2 + 3 * p), d3);
-  xx_storel_32((s - 2 + 4 * p), d4);
-  xx_storel_32((s - 2 + 5 * p), d5);
-  xx_storel_32((s - 2 + 6 * p), d6);
-  xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
-                             const unsigned char *_blimit,
-                             const unsigned char *_limit,
-                             const unsigned char *_thresh) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
-  __m128i p0, q0;
-  __m128i x2, x1, x0, x3;
-  __m128i q1q0, p1p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
-  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
-                        &d7);
-  // Loop filtering
-  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
-                      &blimit, &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 4);
-  q0 = _mm_srli_si128(q1q0, 4);
-
-  transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
-                        &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d1, d3, d5, d7;
-  __m128i q1q0, p1p0;
-  __m128i p1, q1;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
-
-  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  d1 = _mm_srli_si128(d0d1, 8);
-  d3 = _mm_srli_si128(d2d3, 8);
-  d5 = _mm_srli_si128(d4d5, 8);
-  d7 = _mm_srli_si128(d6d7, 8);
-
-  lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
-                           &q1q0, &p1p0, &blimit, &limit, &thresh);
-
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
-
-  transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
-                    &d2d3, &d4d5, &d6d7);
-
-  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
-  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
-  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
-  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
-  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
-}
-
-void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
-                              const unsigned char *_blimit,
-                              const unsigned char *_limit,
-                              const unsigned char *_thresh) {
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x6, x5, x4, x3;
-  __m128i pq0, pq1, pq2, pq3;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-
-  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
-                       &q5p5, &q6p6, &q7p7);
-
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
-
-  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
-                           &q0p0, &pq0, &pq1, &pq2, &pq3);
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
-}
-
-void aom_lpf_vertical_14_dual_sse2(
-    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x7, x6, x5, x4, x3, x2, x1, x0;
-  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
-  __m128i q0, q1, q2, q3, q7;
-  __m128i p0p1, p2p3, p4p5, p6p7;
-
-  __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                                     _mm_load_si128((const __m128i *)_limit1));
-  __m128i thresh =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-
-  x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
-  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
-  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
-
-  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
-                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
-
-  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
-  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
-  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
-  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
-  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
-  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
-  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
-  q7 = _mm_srli_si128(d14d15, 8);
-
-  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                            &blimit, &limit, &thresh);
-
-  x0 = _mm_srli_si128(q0p0, 8);
-  x1 = _mm_srli_si128(q1p1, 8);
-  x2 = _mm_srli_si128(q2p2, 8);
-  x3 = _mm_srli_si128(q3p3, 8);
-  x4 = _mm_srli_si128(q4p4, 8);
-  x5 = _mm_srli_si128(q5p5, 8);
-  x6 = _mm_srli_si128(q6p6, 8);
-
-  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
-                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
-                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
-  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
-  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
-  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
-}
-#endif  // !CONFIG_NEW_DF
diff --git a/aom_dsp/x86/obmc_sad_avx2.c b/aom_dsp/x86/obmc_sad_avx2.c
index 582b05d..2cb3cb3 100644
--- a/aom_dsp/x86/obmc_sad_avx2.c
+++ b/aom_dsp/x86/obmc_sad_avx2.c
@@ -23,128 +23,6 @@
 #include "aom_dsp/x86/synonyms.h"
 
 ////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
-                                            const int pre_stride,
-                                            const int32_t *wsrc,
-                                            const int32_t *mask,
-                                            const int height) {
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
-  do {
-    const __m128i v_p_b_0 = xx_loadl_32(pre);
-    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
-    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
-    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
-    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
-    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
-    n += 8;
-    pre += pre_stride << 1;
-  } while (n < 8 * (height >> 1));
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int obmc_sad_w8n_avx2(
-    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p0_b = xx_loadl_64(pre + n);
-    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
-    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
-    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
-    n += 8;
-
-    if ((n & (width - 1)) == 0) pre += pre_step;
-  } while (n < width * height);
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define OBMCSADWXH(w, h)                                          \
-  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *msk) {                                       \
-    if (w == 4) {                                                 \
-      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
-    } else {                                                      \
-      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
-    }                                                             \
-  }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/aom_dsp/x86/obmc_sad_sse4.c b/aom_dsp/x86/obmc_sad_sse4.c
index 6fdf354..47e99cb 100644
--- a/aom_dsp/x86/obmc_sad_sse4.c
+++ b/aom_dsp/x86/obmc_sad_sse4.c
@@ -23,128 +23,6 @@
 #include "aom_dsp/x86/synonyms.h"
 
 ////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
-                                                 const int pre_stride,
-                                                 const int32_t *wsrc,
-                                                 const int32_t *mask,
-                                                 const int height) {
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  do {
-    const __m128i v_p_b = xx_loadl_32(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
-    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_b = xx_loadl_32(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
-    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
-    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
-    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define OBMCSADWXH(w, h)                                       \
-  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
-      const int32_t *msk) {                                    \
-    if (w == 4) {                                              \
-      return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
-    } else {                                                   \
-      return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
-    }                                                          \
-  }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/aom_dsp/x86/obmc_variance_avx2.c b/aom_dsp/x86/obmc_variance_avx2.c
deleted file mode 100644
index 5214f67..0000000
--- a/aom_dsp/x86/obmc_variance_avx2.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
-                                     const int32_t *wsrc, const int32_t *mask,
-                                     unsigned int *const sse, int *const sum,
-                                     const int w, const int h) {
-  int n = 0, width, height = h;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  __m128i v_d;
-  const uint8_t *pre_temp;
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-  do {
-    width = w;
-    pre_temp = pre;
-    do {
-      const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
-      const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
-      const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
-      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-
-      // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-      // boundaries. We use pmaddwd, as it has lower latency on Haswell
-      // than pmulld but produces the same result with these inputs.
-      const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
-      const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-
-      const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
-      const __m256i v_tmp_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
-      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
-      const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
-      const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
-
-      const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
-      const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-      v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-      pre_temp += 8;
-      n += 8;
-      width -= 8;
-    } while (width > 0);
-    pre += pre_stride;
-    height -= 1;
-  } while (height > 0);
-  v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  *sum = _mm_cvtsi128_si32(v_d);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
-}
-
-static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
-                                      const int32_t *wsrc, const int32_t *mask,
-                                      unsigned int *const sse, int *const sum,
-                                      const int w, const int h) {
-  int n = 0, width, height = h;
-  __m256i v_d;
-  __m128i res0;
-  const uint8_t *pre_temp;
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-
-  assert(w >= 16);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-  do {
-    width = w;
-    pre_temp = pre;
-    do {
-      const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
-      const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
-      const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
-      const __m256i v_m1_d =
-          _mm256_loadu_si256((__m256i const *)(mask + n + 8));
-      const __m256i v_w1_d =
-          _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
-
-      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-      const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
-
-      const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-      const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
-
-      const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-      const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
-
-      const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
-      const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
-
-      const __m256i v_tmp0_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
-      const __m256i v_tmp1_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
-
-      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
-      const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
-
-      const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
-      const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
-      const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-      v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
-      v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
-
-      pre_temp += 16;
-      n += 16;
-      width -= 16;
-    } while (width > 0);
-    pre += pre_stride;
-    height -= 1;
-  } while (height > 0);
-
-  v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
-  v_d = _mm256_hadd_epi32(v_d, v_d);
-  res0 = _mm256_castsi256_si128(v_d);
-  res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
-  *sum = _mm_cvtsi128_si32(res0);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
-}
-
-#define OBMCVARWXH(W, H)                                                \
-  unsigned int aom_obmc_variance##W##x##H##_avx2(                       \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
-      const int32_t *mask, unsigned int *sse) {                         \
-    int sum;                                                            \
-    if (W == 4) {                                                       \
-      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);      \
-    } else if (W == 8) {                                                \
-      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);  \
-    } else {                                                            \
-      obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
-    }                                                                   \
-                                                                        \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));       \
-  }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
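
For reference, the 8-bit OBMC kernels removed above (the obmc_sad SIMD files and obmc_variance_avx2.c, plus obmc_variance_sse4.c below) all operate on the same rounded weighted difference: wsrc and mask are 32-bit planes packed at the block width, pre is the 8-bit predictor, and each wsrc[i] - mask[i] * pre[i] is rounded by 12 fractional bits. The SAD kernels sum the absolute value of that quantity; the variance kernels accumulate its sum and sum of squares and finish with var = sse - sum^2 / (w * h). A minimal scalar sketch of the variance case (illustrative only, not part of this patch; the function name is invented here):

#include <stdint.h>

static unsigned int obmc_variance_c_sketch(const uint8_t *pre, int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask, int w, int h,
                                            unsigned int *sse) {
  int64_t sum = 0, ssq = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int32_t diff = wsrc[c] - mask[c] * pre[c];
      // Round |diff| to nearest over 12 fractional bits, keeping the sign.
      const int32_t ad = diff >= 0 ? diff : -diff;
      const int32_t rdiff = (diff >= 0 ? 1 : -1) * ((ad + (1 << 11)) >> 12);
      sum += rdiff;
      ssq += (int64_t)rdiff * rdiff;
    }
    pre += pre_stride;
    wsrc += w;  // wsrc and mask use the block width as their stride
    mask += w;
  }
  *sse = (unsigned int)ssq;
  return *sse - (unsigned int)((sum * sum) / (w * h));
}

The removed OBMCSADWXH / OBMCVARWXH macros instantiate the vectorized equivalent of this loop once per supported block size.
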
diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c
index 12ee91f..7aad27f 100644
--- a/aom_dsp/x86/obmc_variance_sse4.c
+++ b/aom_dsp/x86/obmc_variance_sse4.c
@@ -24,147 +24,6 @@
 #include "aom_dsp/x86/synonyms.h"
 
 ////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
-                                     const int32_t *wsrc, const int32_t *mask,
-                                     unsigned int *const sse, int *const sum,
-                                     const int w, const int h) {
-  const int pre_step = pre_stride - w;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_b = xx_loadl_32(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
-    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
-    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
-    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
-    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
-    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 8;
-
-    if (n % w == 0) pre += pre_step;
-  } while (n < w * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#define OBMCVARWXH(W, H)                                               \
-  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
-      const int32_t *mask, unsigned int *sse) {                        \
-    int sum;                                                           \
-    if (W == 4) {                                                      \
-      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
-    } else {                                                           \
-      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
-    }                                                                  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
-  }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
-
-#include "config/aom_dsp_rtcd.h"
-
-#define OBMC_SUBPIX_VAR(W, H)                                                \
-  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint8_t temp2[H * W];                                                    \
-                                                                             \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
-        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
-  }
-
-OBMC_SUBPIX_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 64)
-OBMC_SUBPIX_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 32)
-OBMC_SUBPIX_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 16)
-OBMC_SUBPIX_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 8)
-OBMC_SUBPIX_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 4)
-OBMC_SUBPIX_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_SUBPIX_VAR(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 static INLINE void hbd_obmc_variance_w4(
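
The comment repeated throughout the removed 8-bit kernels — that pmaddwd can stand in for pmulld when pre and mask values fit in 15 bits and are packed at 32-bit boundaries — holds because the upper 16-bit half of every 32-bit lane is then zero, so the pairwise multiply-add collapses to the plain 32-bit product. A small standalone check of that equivalence (not from the tree, purely illustrative):

#include <smmintrin.h>  // SSE4.1 for _mm_mullo_epi32
#include <stdio.h>

int main(void) {
  // One value per 32-bit lane, all within 15 bits.
  const __m128i a = _mm_setr_epi32(3, 255, 4096, 32767);
  const __m128i b = _mm_setr_epi32(7, 129, 2048, 32767);
  const __m128i madd = _mm_madd_epi16(a, b);   // pmaddwd
  const __m128i mull = _mm_mullo_epi32(a, b);  // pmulld
  const int equal =
      _mm_movemask_epi8(_mm_cmpeq_epi32(madd, mull)) == 0xFFFF;
  printf("pmaddwd matches pmulld for 15-bit inputs: %d\n", equal);
  return 0;
}
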
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
deleted file mode 100644
index b720f9c..0000000
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ /dev/null
@@ -1,480 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro AVG_4x2x4 2
-  movh                  m2, [second_predq]
-  movlhps               m2, m2
-  pavgb                 %1, m2
-  pavgb                 %2, m2
-  lea                   second_predq, [second_predq+8]
-%endmacro
-; 'mflag' affect a lot how the code works.
-;
-; When 'mflag' is false, the 'src_strideq' resides in register,
-; [srcq + src_strideq + offset] is allowed, so we can simply
-; use such form to access src memory and don't bother to update
-; 'srcq' at each line. We only update 'srcq' each two-lines using
-; a compact LEA instruction like [srcq+src_strideq*2].
-;
-; When 'mflag' is true, the 'src_strideq' resides in memory.
-; we cannot use above form to access memory, we have to update
-; 'srcq' at each line break. As we process two parts (first,second)
-; together in each macro function, the second part may also sit
-; in the next line, which means we also need to possibly add
-; one 'src_strideq' to 'srcq' before processing second part.
-
-%macro HANDLE_FIRST_OFFSET 2
-  %define first_offset %2
-  %if mflag == 0 && %1 == 1
-    %define first_offset (src_strideq + %2)
-  %endif
-%endmacro
-
-; first_extraline, second_extraline, in_line_offset
-%macro HANDLE_SECOND_OFFSET 3
-  %define second_offset %3
-  %if mflag && %1 == 0 && %2 == 1
-    add srcq, src_strideq
-  %endif
-  %if mflag == 0 && %2 == 1
-    %define second_offset (src_strideq + %3)
-  %endif
-%endmacro
-
-; Notes for line_ending:
-; 0 -- not a line ending
-; 1 -- line ending of a odd line [line numbers starts from one]
-; 2 -- line ending of a even line
-; This is specically designed to handle when src_strideq is a
-; memory position, under such case, we can not accomplish
-; complex address calculation using LEA, and fall back to
-; using simple ADD instruction at each line ending.
-%macro ADVANCE_END_OF_LINE 1
-  %if mflag
-    add srcq, src_strideq
-  %endif
-  %if mflag == 0 && %1 == 2
-    lea                 srcq, [srcq +src_strideq*2]
-  %endif
-
-  %if %1 == 2
-    lea                ref1q, [ref1q+ref_strideq*2]
-    lea                ref2q, [ref2q+ref_strideq*2]
-    lea                ref3q, [ref3q+ref_strideq*2]
-    lea                ref4q, [ref4q+ref_strideq*2]
-  %endif
-%endmacro
-
-; Please note that the second_offset of src is for in_line_offset,
-; so it is less than src_stride.
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg,
-;               {first, second}_extraline, line_ending
-%macro PROCESS_4x2x4 9
-  HANDLE_FIRST_OFFSET   %7, %2
-  movd                  m0, [srcq + first_offset]
-  HANDLE_SECOND_OFFSET  %7, %8, %4
-%if %1 == 1
-  movd                  m6, [ref1q+%3]
-  movd                  m4, [ref2q+%3]
-  movd                  m7, [ref3q+%3]
-  movd                  m5, [ref4q+%3]
-
-  movd                  m1, [srcq + second_offset]
-  movd                  m2, [ref1q+%5]
-  punpckldq             m0, m1
-  punpckldq             m6, m2
-  movd                  m1, [ref2q+%5]
-  movd                  m2, [ref3q+%5]
-  movd                  m3, [ref4q+%5]
-  punpckldq             m4, m1
-  punpckldq             m7, m2
-  punpckldq             m5, m3
-  movlhps               m0, m0
-  movlhps               m6, m4
-  movlhps               m7, m5
-%if %6 == 1
-  AVG_4x2x4             m6, m7
-%endif
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movd                  m1, [ref1q+%3]
-  movd                  m5, [ref1q+%5]
-  movd                  m2, [ref2q+%3]
-  movd                  m4, [ref2q+%5]
-  punpckldq             m1, m5
-  punpckldq             m2, m4
-  movd                  m3, [ref3q+%3]
-  movd                  m5, [ref3q+%5]
-  punpckldq             m3, m5
-  movd                  m4, [ref4q+%3]
-  movd                  m5, [ref4q+%5]
-  punpckldq             m4, m5
-  movd                  m5, [srcq + second_offset]
-  punpckldq             m0, m5
-  movlhps               m0, m0
-  movlhps               m1, m2
-  movlhps               m3, m4
-%if %6 == 1
-  AVG_4x2x4             m1, m3
-%endif
-  psadbw                m1, m0
-  psadbw                m3, m0
-  paddd                 m6, m1
-  paddd                 m7, m3
-%endif
-%if %9 > 0
-  ADVANCE_END_OF_LINE %9
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg,
-;               {first,second}_extraline, line_ending
-%macro PROCESS_8x2x4 9
-  HANDLE_FIRST_OFFSET   %7, %2
-  movh                  m0, [srcq + first_offset]
-  HANDLE_SECOND_OFFSET  %7, %8, %4
-%if %1 == 1
-  movh                  m4, [ref1q+%3]
-  movh                  m5, [ref2q+%3]
-  movh                  m6, [ref3q+%3]
-  movh                  m7, [ref4q+%3]
-  movhps                m0, [srcq + second_offset]
-  movhps                m4, [ref1q+%5]
-  movhps                m5, [ref2q+%5]
-  movhps                m6, [ref3q+%5]
-  movhps                m7, [ref4q+%5]
-%if %6 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m4, m3
-  pavgb                 m5, m3
-  pavgb                 m6, m3
-  pavgb                 m7, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
-  psadbw                m4, m0
-  psadbw                m5, m0
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movh                  m1, [ref1q+%3]
-  movh                  m2, [ref2q+%3]
-  movhps                m0, [srcq + second_offset]
-  movhps                m1, [ref1q+%5]
-  movhps                m2, [ref2q+%5]
-%if %6 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-%endif
-  psadbw                m1, m0
-  psadbw                m2, m0
-  paddd                 m4, m1
-  paddd                 m5, m2
-
-  movh                  m1, [ref3q+%3]
-  movhps                m1, [ref3q+%5]
-  movh                  m2, [ref4q+%3]
-  movhps                m2, [ref4q+%5]
-%if %6 == 1
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
-  psadbw                m1, m0
-  psadbw                m2, m0
-  paddd                 m6, m1
-  paddd                 m7, m2
-%endif
-%if %9 > 0
-  ADVANCE_END_OF_LINE %9
-%endif
-%endmacro
-
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg,
-;                {first,second}_extraline, line_ending
-%macro PROCESS_16x2x4 9
-  ; 1st 16 px
-  HANDLE_FIRST_OFFSET   %7, %2
-  mova                  m0, [srcq + first_offset]
-  HANDLE_SECOND_OFFSET  %7, %8, %4
-%if %1 == 1
-  movu                  m4, [ref1q+%3]
-  movu                  m5, [ref2q+%3]
-  movu                  m6, [ref3q+%3]
-  movu                  m7, [ref4q+%3]
-%if %6 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m4, m3
-  pavgb                 m5, m3
-  pavgb                 m6, m3
-  pavgb                 m7, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
-  psadbw                m4, m0
-  psadbw                m5, m0
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else ; %1 == 1
-  movu                  m1, [ref1q+%3]
-  movu                  m2, [ref2q+%3]
-%if %6 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-%endif
-  psadbw                m1, m0
-  psadbw                m2, m0
-  paddd                 m4, m1
-  paddd                 m5, m2
-
-  movu                  m1, [ref3q+%3]
-  movu                  m2, [ref4q+%3]
-%if %6 == 1
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
-  psadbw                m1, m0
-  psadbw                m2, m0
-  paddd                 m6, m1
-  paddd                 m7, m2
-%endif ; %1 == 1
-
-  ; 2nd 16 px
-  mova                  m0, [srcq + second_offset]
-  movu                  m1, [ref1q+%5]
-  movu                  m2, [ref2q+%5]
-
-%if %6 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-%endif
-  psadbw                m1, m0
-  psadbw                m2, m0
-  paddd                 m4, m1
-  paddd                 m5, m2
-
-  movu                  m1, [ref3q+%5]
-  movu                  m2, [ref4q+%5]
-
-%if %9 > 0
-  ADVANCE_END_OF_LINE %9
-%endif
-
-%if %6 == 1
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
-  psadbw                m1, m0
-  psadbw                m2, m0
-  paddd                 m6, m1
-  paddd                 m7, m2
-%endmacro
-
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg,
-;                {first,second}_extraline, line_ending
-%macro PROCESS_32x2x4 9
-  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7
-  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9
-%endmacro
-
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg,
-;                {first,second}_extraline, line_ending
-%macro PROCESS_64x2x4 9
-  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7
-  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9
-%endmacro
-
-; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg,
-;                 {first,second}_extraline, line_ending
-%macro PROCESS_128x2x4 9
-  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7
-  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9
-%endmacro
-
-; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
-;                         uint8_t *ref[4], int ref_stride,
-;                         uint32_t res[4]);
-; Macro Arguments:
-;   1: Width
-;   2: Height
-;   3: If 0, then normal sad, else avg
-;   4: If 0, then normal sad, else skip rows
-%macro SADNXN4D 2-4 0,0
-%if %4 == 1  ; skip rows
-%if UNIX64
-cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
-                              res, ref2, ref3, ref4
-%else
-cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
-                              ref2, ref3, ref4
-%endif
-%elif %3 == 0  ; normal sad
-%if UNIX64
-cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
-                              res, ref2, ref3, ref4
-%else
-cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
-                              ref2, ref3, ref4
-%endif
-%else ; avg
-%if UNIX64
-cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \
-                                  second_pred, res, ref2, ref3, ref4
-%else
-cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \
-                                  second_pred, ref2, ref3
-  %define src_strideq r1mp
-  %define src_strided r1mp
-%endif
-%endif
-
-  %define mflag ((1 - UNIX64) & %3)
-%if %4 == 1
-  lea          src_strided, [2*src_strided]
-  lea          ref_strided, [2*ref_strided]
-%endif
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-
-  mov                ref2q, [ref1q+gprsize*1]
-  mov                ref3q, [ref1q+gprsize*2]
-  mov                ref4q, [ref1q+gprsize*3]
-  mov                ref1q, [ref1q+gprsize*0]
-
-  PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2
-%if %4 == 1  ; downsample number of rows by 2
-%define num_rep (%2-8)/4
-%else
-%define num_rep (%2-4)/2
-%endif
-%rep num_rep
-  PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
-%endrep
-%undef num_rep
-  PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2
-
-%if %3 == 0
-  %define resultq r4
-  %define resultmp r4mp
-%else
-  %define resultq r5
-  %define resultmp r5mp
-%endif
-
-%if %1 > 4
-  pslldq                m5, 4
-  pslldq                m7, 4
-  por                   m4, m5
-  por                   m6, m7
-  mova                  m5, m4
-  mova                  m7, m6
-  punpcklqdq            m4, m6
-  punpckhqdq            m5, m7
-  paddd                 m4, m5
-%if %4 == 1
-  pslld                 m4, 1
-%endif
-  movifnidn             resultq, resultmp
-  movu                [resultq], m4
-  RET
-%else
-  pshufd            m6, m6, 0x08
-  pshufd            m7, m7, 0x08
-%if %4 == 1
-  pslld                 m6, 1
-  pslld                 m7, 1
-%endif
-  movifnidn             resultq, resultmp
-  movq              [resultq+0], m6
-  movq              [resultq+8], m7
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-SADNXN4D 128, 128
-SADNXN4D 128,  64
-SADNXN4D  64, 128
-SADNXN4D  64,  64
-SADNXN4D  64,  32
-SADNXN4D  32,  64
-SADNXN4D  32,  32
-SADNXN4D  32,  16
-SADNXN4D  16,  32
-SADNXN4D  16,  16
-SADNXN4D  16,   8
-SADNXN4D   8,  16
-SADNXN4D   8,   8
-SADNXN4D   8,   4
-SADNXN4D   4,   8
-SADNXN4D   4,   4
-SADNXN4D   4,  16
-SADNXN4D  16,   4
-SADNXN4D   8,  32
-SADNXN4D  32,   8
-SADNXN4D  16,  64
-SADNXN4D  64,  16
-SADNXN4D 128, 128, 1
-SADNXN4D 128,  64, 1
-SADNXN4D  64, 128, 1
-SADNXN4D  64,  64, 1
-SADNXN4D  64,  32, 1
-SADNXN4D  32,  64, 1
-SADNXN4D  32,  32, 1
-SADNXN4D  32,  16, 1
-SADNXN4D  16,  32, 1
-SADNXN4D  16,  16, 1
-SADNXN4D  16,   8, 1
-SADNXN4D   8,  16, 1
-SADNXN4D   8,   8, 1
-SADNXN4D   8,   4, 1
-SADNXN4D   4,   8, 1
-SADNXN4D   4,   4, 1
-SADNXN4D   4,  16, 1
-SADNXN4D  16,   4, 1
-SADNXN4D   8,  32, 1
-SADNXN4D  32,   8, 1
-SADNXN4D  16,  64, 1
-SADNXN4D  64,  16, 1
-SADNXN4D 128, 128, 0, 1
-SADNXN4D 128,  64, 0, 1
-SADNXN4D  64, 128, 0, 1
-SADNXN4D  64,  64, 0, 1
-SADNXN4D  64,  32, 0, 1
-SADNXN4D  32,  64, 0, 1
-SADNXN4D  32,  32, 0, 1
-SADNXN4D  32,  16, 0, 1
-SADNXN4D  16,  32, 0, 1
-SADNXN4D  16,  16, 0, 1
-SADNXN4D  16,   8, 0, 1
-SADNXN4D   8,  16, 0, 1
-SADNXN4D   8,   8, 0, 1
-SADNXN4D   4,   8, 0, 1
-SADNXN4D   4,  16, 0, 1
-SADNXN4D   8,  32, 0, 1
-SADNXN4D  32,   8, 0, 1
-SADNXN4D  16,  64, 0, 1
-SADNXN4D  64,  16, 0, 1
-
-; Different assembly is needed when the height gets subsampled to 2
-; SADNXN4D 16,  4, 0, 1
-; SADNXN4D  8,  4, 0, 1
-; SADNXN4D  4,  4, 0, 1
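
The removed sadMxNx4d assembly computes the SAD of one source block against four reference blocks per call; the _skip_ variants double both strides so only every other row is visited, then double the accumulated result (the final pslld by 1). A scalar sketch of that contract (illustrative only; the name is invented here):

#include <stdint.h>
#include <stdlib.h>

static void sad_x4d_c_sketch(const uint8_t *src, int src_stride,
                             const uint8_t *const ref[4], int ref_stride,
                             int width, int height, int skip_rows,
                             uint32_t res[4]) {
  const int step = skip_rows ? 2 : 1;
  for (int i = 0; i < 4; ++i) {
    uint32_t sad = 0;
    for (int r = 0; r < height; r += step)
      for (int c = 0; c < width; ++c)
        sad += abs(src[r * src_stride + c] - ref[i][r * ref_stride + c]);
    res[i] = skip_rows ? 2 * sad : sad;  // scale back up for the skipped rows
  }
}
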
diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c
deleted file mode 100644
index 991d63d..0000000
--- a/aom_dsp/x86/sad_avx2.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  int i, res;
-  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
-  __m256i sum_sad = _mm256_setzero_si256();
-  __m256i sum_sad_h;
-  __m128i sum_sad128;
-  for (i = 0; i < h; i++) {
-    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
-    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
-    sad1_reg =
-        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
-    sad2_reg = _mm256_sad_epu8(
-        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
-    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
-    ref_ptr += ref_stride;
-    src_ptr += src_stride;
-  }
-  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
-  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
-  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
-  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
-  res = _mm_cvtsi128_si32(sum_sad128);
-  _mm256_zeroupper();
-  return res;
-}
-
-static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
-                                        const uint8_t *ref_ptr, int ref_stride,
-                                        int h) {
-  int i, res;
-  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
-  __m256i sum_sad = _mm256_setzero_si256();
-  __m256i sum_sad_h;
-  __m128i sum_sad128;
-  int ref2_stride = ref_stride << 1;
-  int src2_stride = src_stride << 1;
-  int max = h >> 1;
-  for (i = 0; i < max; i++) {
-    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
-    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
-    sad1_reg =
-        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
-    sad2_reg = _mm256_sad_epu8(
-        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
-    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
-    ref_ptr += ref2_stride;
-    src_ptr += src2_stride;
-  }
-  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
-  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
-  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
-  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
-  res = _mm_cvtsi128_si32(sum_sad128);
-  _mm256_zeroupper();
-  return res;
-}
-
-#define FSAD64_H(h)                                                           \
-  unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride) { \
-    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
-  }
-
-#define FSADS64_H(h)                                                          \
-  unsigned int aom_sad_skip_64x##h##_avx2(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
-
-#define FSAD32_H(h)                                                           \
-  unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride) { \
-    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
-  }
-
-#define FSADS32_H(h)                                                          \
-  unsigned int aom_sad_skip_32x##h##_avx2(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
-
-#define FSAD64   \
-  FSAD64_H(64);  \
-  FSAD64_H(32);  \
-  FSADS64_H(64); \
-  FSADS64_H(32);
-
-#define FSAD32   \
-  FSAD32_H(64);  \
-  FSAD32_H(32);  \
-  FSAD32_H(16);  \
-  FSADS32_H(64); \
-  FSADS32_H(32); \
-  FSADS32_H(16);
-
-/* clang-format off */
-FSAD64
-FSAD32
-/* clang-format on */
-
-#undef FSAD64
-#undef FSAD32
-#undef FSAD64_H
-#undef FSAD32_H
-
-#define FSADAVG64_H(h)                                                        \
-  unsigned int aom_sad64x##h##_avg_avx2(                                      \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      ref1_reg = _mm256_avg_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
-      ref2_reg = _mm256_avg_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-      second_pred += 64;                                                      \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSADAVG32_H(h)                                                        \
-  unsigned int aom_sad32x##h##_avg_avx2(                                      \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      ref1_reg = _mm256_avg_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
-      ref2_reg = _mm256_avg_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-      second_pred += 64;                                                      \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
-
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
-
-/* clang-format off */
-FSADAVG64
-FSADAVG32
-/* clang-format on */
-
-#undef FSADAVG64
-#undef FSADAVG32
-#undef FSADAVG64_H
-#undef FSADAVG32_H
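
The _avg_ kernels removed here fold compound prediction into the SAD: each reference pixel is first averaged with second_pred using pavgb rounding before the difference is taken, with second_pred laid out as a contiguous width-by-height block. A scalar sketch of the same computation (illustrative only, name invented here):

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_avg_c_sketch(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *second_pred, int width,
                                     int height) {
  unsigned int sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int avg = (ref[c] + second_pred[c] + 1) >> 1;  // pavgb rounding
      sad += abs(src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;  // contiguous prediction block
  }
  return sad;
}
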
diff --git a/aom_dsp/x86/sad_sse2.asm b/aom_dsp/x86/sad_sse2.asm
deleted file mode 100644
index bd8ef45..0000000
--- a/aom_dsp/x86/sad_sse2.asm
+++ /dev/null
@@ -1,432 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; Macro Arguments
-; Arg 1: Width
-; Arg 2: Height
-; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
-; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
-%macro SAD_FN 4
-%if %4 == 0 ; normal sad
-%if %3 == 5
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
-                            src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-
-%elif %4 == 2 ; skip
-%if %3 == 5
-cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
-                            src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-
-%else
-%if %3 == 5
-cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
-                                    second_pred, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
-                                              ref, ref_stride, \
-                                              second_pred, \
-                                              src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; sad/avg/skip
-%if %4 == 2; skip rows so double the stride
-lea           src_strided, [src_strided*2]
-lea           ref_strided, [ref_strided*2]
-%endif ; %4 skip
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-%endmacro
-
-; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
-;                                  uint8_t *ref, int ref_stride);
-%macro SAD128XN 1-2 0
-  SAD_FN 128, %1, 5, %2
-%if %2 == 2
-  mov              n_rowsd, %1/2
-%else
-  mov              n_rowsd, %1
-%endif
-  pxor                  m0, m0
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+32]
-  psadbw                m4, [srcq+48]
-
-  paddd                 m1, m2
-  paddd                 m3, m4
-  paddd                 m0, m1
-  paddd                 m0, m3
-
-  movu                  m1, [refq+64]
-  movu                  m2, [refq+80]
-  movu                  m3, [refq+96]
-  movu                  m4, [refq+112]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*4]
-  pavgb                 m2, [second_predq+mmsize*5]
-  pavgb                 m3, [second_predq+mmsize*6]
-  pavgb                 m4, [second_predq+mmsize*7]
-  lea         second_predq, [second_predq+mmsize*8]
-%endif
-  psadbw                m1, [srcq+64]
-  psadbw                m2, [srcq+80]
-  psadbw                m3, [srcq+96]
-  psadbw                m4, [srcq+112]
-
-  add                 refq, ref_strideq
-  add                 srcq, src_strideq
-
-  paddd                 m1, m2
-  paddd                 m3, m4
-  paddd                 m0, m1
-  paddd                 m0, m3
-
-  sub              n_rowsd, 1
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-%if %2 == 2 ; we skipped rows, so now we need to double the sad
-  pslld                 m0, 1
-%endif
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD128XN 128     ; sad128x128_sse2
-SAD128XN 128, 1  ; sad128x128_avg_sse2
-SAD128XN 128, 2  ; sad128x128_skip_sse2
-SAD128XN 64      ; sad128x64_sse2
-SAD128XN 64, 1   ; sad128x64_avg_sse2
-SAD128XN 64, 2   ; sad128x64_skip_sse2
-
-
-; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD64XN 1-2 0
-  SAD_FN 64, %1, 5, %2
-%if %2 == 2
-  mov              n_rowsd, %1/2
-%else
-  mov              n_rowsd, %1
-%endif
-  pxor                  m0, m0
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+32]
-  psadbw                m4, [srcq+48]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  add                 refq, ref_strideq
-  paddd                 m0, m1
-  add                 srcq, src_strideq
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-%if %2 == 2 ; we skipped rows, so now we need to double the sad
-  pslld                 m0, 1
-%endif
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD64XN 128     ; sad64x128_sse2
-SAD64XN  64     ; sad64x64_sse2
-SAD64XN  32     ; sad64x32_sse2
-SAD64XN  16     ; sad64x16_sse2
-SAD64XN 128, 1  ; sad64x128_avg_sse2
-SAD64XN  64, 1  ; sad64x64_avg_sse2
-SAD64XN  32, 1  ; sad64x32_avg_sse2
-SAD64XN  16, 1  ; sad64x16_avg_sse2
-SAD64XN 128, 2  ; sad64x128_skip_sse2
-SAD64XN  64, 2  ; sad64x64_skip_sse2
-SAD64XN  32, 2  ; sad64x32_skip_sse2
-SAD64XN  16, 2  ; sad64x16_skip_sse2
-
-; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD32XN 1-2 0
-  SAD_FN 32, %1, 5, %2
-%if %2 == 2
-  mov              n_rowsd, %1/4
-%else
-  mov              n_rowsd, %1/2
-%endif
-  pxor                  m0, m0
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+ref_strideq]
-  movu                  m4, [refq+ref_strideq+16]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+src_strideq]
-  psadbw                m4, [srcq+src_strideq+16]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-%if %2 == 2 ; we skipped rows, so now we need to double the sad
-  pslld                 m0, 1
-%endif
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD32XN 64    ; sad32x64_sse2
-SAD32XN 32    ; sad32x32_sse2
-SAD32XN 16    ; sad32x16_sse2
-SAD32XN  8    ; sad_32x8_sse2
-SAD32XN 64, 1 ; sad32x64_avg_sse2
-SAD32XN 32, 1 ; sad32x32_avg_sse2
-SAD32XN 16, 1 ; sad32x16_avg_sse2
-SAD32XN  8, 1 ; sad_32x8_avg_sse2
-SAD32XN 64, 2 ; sad32x64_skip_sse2
-SAD32XN 32, 2 ; sad32x32_skip_sse2
-SAD32XN 16, 2 ; sad32x16_skip_sse2
-SAD32XN  8, 2 ; sad_32x8_skip_sse2
-
-; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro SAD16XN 1-2 0
-  SAD_FN 16, %1, 7, %2
-%if %2 == 2
-  mov              n_rowsd, %1/8
-%else
-  mov              n_rowsd, %1/4
-%endif
-  pxor                  m0, m0
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+ref_strideq]
-  movu                  m3, [refq+ref_strideq*2]
-  movu                  m4, [refq+ref_stride3q]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+src_strideq]
-  psadbw                m3, [srcq+src_strideq*2]
-  psadbw                m4, [srcq+src_stride3q]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-%if %2 == 2 ; we skipped rows, so now we need to double the sad
-  pslld                 m0, 1
-%endif
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD16XN 64    ; sad_16x64_sse2
-SAD16XN 32    ; sad16x32_sse2
-SAD16XN 16    ; sad16x16_sse2
-SAD16XN  8    ; sad16x8_sse2
-SAD16XN  4    ; sad_16x4_sse2
-SAD16XN 64, 1 ; sad_16x64_avg_sse2
-SAD16XN 32, 1 ; sad16x32_avg_sse2
-SAD16XN 16, 1 ; sad16x16_avg_sse2
-SAD16XN  8, 1 ; sad16x8_avg_sse2
-SAD16XN  4, 1 ; sad_16x4_avg_sse2
-SAD16XN 64, 2 ; sad_16x64_skip_sse2
-SAD16XN 32, 2 ; sad16x32_skip_sse2
-SAD16XN 16, 2 ; sad16x16_skip_sse2
-SAD16XN  8, 2 ; sad16x8_skip_sse2
-
-; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
-;                                   uint8_t *ref, int ref_stride);
-%macro SAD8XN 1-2 0
-  SAD_FN 8, %1, 7, %2
-%if %2 == 2
-  mov              n_rowsd, %1/8
-%else
-  mov              n_rowsd, %1/4
-%endif
-  pxor                  m0, m0
-
-.loop:
-  movh                  m1, [refq]
-  movhps                m1, [refq+ref_strideq]
-  movh                  m2, [refq+ref_strideq*2]
-  movhps                m2, [refq+ref_stride3q]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  lea         second_predq, [second_predq+mmsize*2]
-%endif
-  movh                  m3, [srcq]
-  movhps                m3, [srcq+src_strideq]
-  movh                  m4, [srcq+src_strideq*2]
-  movhps                m4, [srcq+src_stride3q]
-  psadbw                m1, m3
-  psadbw                m2, m4
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m2
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-%if %2 == 2 ; we skipped rows, so now we need to double the sad
-  pslld                 m0, 1
-%endif
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD8XN 32    ; sad_8x32_sse2
-SAD8XN 16    ; sad8x16_sse2
-SAD8XN  8    ; sad8x8_sse2
-SAD8XN  4    ; sad8x4_sse2
-SAD8XN 32, 1 ; sad_8x32_avg_sse2
-SAD8XN 16, 1 ; sad8x16_avg_sse2
-SAD8XN  8, 1 ; sad8x8_avg_sse2
-SAD8XN  4, 1 ; sad8x4_avg_sse2
-SAD8XN 32, 2 ; sad_8x32_skip_sse2
-SAD8XN 16, 2 ; sad8x16_skip_sse2
-SAD8XN  8, 2 ; sad8x8_skip_sse2
-
-; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
-;                                   uint8_t *ref, int ref_stride);
-%macro SAD4XN 1-2 0
-  SAD_FN 4, %1, 7, %2
-%if %2 == 2
-  mov              n_rowsd, %1/8
-%else
-  mov              n_rowsd, %1/4
-%endif
-  pxor                  m0, m0
-
-.loop:
-  movd                  m1, [refq]
-  movd                  m2, [refq+ref_strideq]
-  movd                  m3, [refq+ref_strideq*2]
-  movd                  m4, [refq+ref_stride3q]
-  punpckldq             m1, m2
-  punpckldq             m3, m4
-  movlhps               m1, m3
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  lea         second_predq, [second_predq+mmsize*1]
-%endif
-  movd                  m2, [srcq]
-  movd                  m5, [srcq+src_strideq]
-  movd                  m4, [srcq+src_strideq*2]
-  movd                  m3, [srcq+src_stride3q]
-  punpckldq             m2, m5
-  punpckldq             m4, m3
-  movlhps               m2, m4
-  psadbw                m1, m2
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-%if %2 == 2 ; we skipped rows, so now we need to double the sad
-  pslld                 m0, 1
-%endif
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD4XN 16 ; sad_4x16_sse2
-SAD4XN  8 ; sad4x8_sse
-SAD4XN  4 ; sad4x4_sse
-SAD4XN 16, 1 ; sad_4x16_avg_sse2
-SAD4XN  8, 1 ; sad4x8_avg_sse
-SAD4XN  4, 1 ; sad4x4_avg_sse
-SAD4XN 16, 2 ; sad_4x16_skip_sse2
-SAD4XN  8, 2 ; sad4x8_skip_sse
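
The 8-bit portion removed from sse_avx2.c below vectorizes a plain sum of squared differences over the block; the deleted helpers differ only in how many pixels they load and widen per step. A scalar equivalent, as a sketch rather than the library's own C fallback:

#include <stdint.h>

static int64_t sse_c_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int diff = a[c] - b[c];
      sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}
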
diff --git a/aom_dsp/x86/sse_avx2.c b/aom_dsp/x86/sse_avx2.c
index 817cb63..fc05e3f 100644
--- a/aom_dsp/x86/sse_avx2.c
+++ b/aom_dsp/x86/sse_avx2.c
@@ -18,21 +18,6 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 
-static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
-                                const uint8_t *b) {
-  const __m256i v_a0 = yy_loadu_256(a);
-  const __m256i v_b0 = yy_loadu_256(b);
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
-  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
-  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
-  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
-  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
-  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
-}
-
 static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
   int64_t sum;
   __m256i zero = _mm256_setzero_si256();
@@ -65,153 +50,6 @@
   return sum;
 }
 
-static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
-                                 const uint8_t *b, int b_stride, __m256i *sum) {
-  const __m128i v_a0 = xx_loadl_32(a);
-  const __m128i v_a1 = xx_loadl_32(a + a_stride);
-  const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
-  const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
-  const __m128i v_b0 = xx_loadl_32(b);
-  const __m128i v_b1 = xx_loadl_32(b + b_stride);
-  const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
-  const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
-  const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
-                                             _mm_unpacklo_epi32(v_a2, v_a3));
-  const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
-                                             _mm_unpacklo_epi32(v_b2, v_b3));
-  const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
-  const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
-  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
-}
-static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
-                                 const uint8_t *b, int b_stride, __m256i *sum) {
-  const __m128i v_a0 = xx_loadl_64(a);
-  const __m128i v_a1 = xx_loadl_64(a + a_stride);
-  const __m128i v_b0 = xx_loadl_64(b);
-  const __m128i v_b1 = xx_loadl_64(b + b_stride);
-  const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
-  const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
-  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
-}
-int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int width, int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  __m256i sum = _mm256_setzero_si256();
-  __m256i zero = _mm256_setzero_si256();
-  switch (width) {
-    case 4:
-      do {
-        sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
-        a += a_stride << 2;
-        b += b_stride << 2;
-        y += 4;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 8:
-      do {
-        sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 16:
-      do {
-        const __m128i v_a0 = xx_loadu_128(a);
-        const __m128i v_a1 = xx_loadu_128(a + a_stride);
-        const __m128i v_b0 = xx_loadu_128(b);
-        const __m128i v_b1 = xx_loadu_128(b + b_stride);
-        const __m256i v_a =
-            _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
-        const __m256i v_b =
-            _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
-        const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
-        const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
-        const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
-        const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
-        const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
-        const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
-        const __m256i temp =
-            _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
-                             _mm256_madd_epi16(v_bsub, v_bsub));
-        sum = _mm256_add_epi32(sum, temp);
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 32:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 64:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        sse_w32_avx2(&sum, a + 32, b + 32);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 128:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        sse_w32_avx2(&sum, a + 32, b + 32);
-        sse_w32_avx2(&sum, a + 64, b + 64);
-        sse_w32_avx2(&sum, a + 96, b + 96);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    default:
-      if ((width & 0x07) == 0) {
-        do {
-          int i = 0;
-          do {
-            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
-            i += 8;
-          } while (i < width);
-          a += a_stride << 1;
-          b += b_stride << 1;
-          y += 2;
-        } while (y < height);
-      } else {
-        do {
-          int i = 0;
-          do {
-            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
-            const uint8_t *a2 = a + i + (a_stride << 1);
-            const uint8_t *b2 = b + i + (b_stride << 1);
-            sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
-            i += 8;
-          } while (i + 4 < width);
-          sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
-          a += a_stride << 2;
-          b += b_stride << 2;
-          y += 4;
-        } while (y < height);
-      }
-      sse = summary_all_avx2(&sum);
-      break;
-  }
-
-  return sse;
-}
-
 static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
                                        const uint16_t *b) {
   const __m256i v_a_w = yy_loadu_256(a);
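Only the 8-bit aom_sse_avx2 path is dropped from this file; the high-bit-depth kernels that follow are kept. The removed code accumulates a plain sum of squared differences over two uint8_t planes, widening the pixels to 16 bits and feeding the differences to _mm256_madd_epi16. A scalar sketch of the same quantity (illustrative naming, not code from the library):

#include <stdint.h>

/* Illustrative scalar model of the removed 8-bit aom_sse_avx2 /
 * aom_sse_sse4_1: sum of squared differences between two 8-bit planes. */
static int64_t sse_sketch(const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int d = a[y * a_stride + x] - b[y * b_stride + x];
      sse += d * d;
    }
  }
  return sse;
}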
diff --git a/aom_dsp/x86/sse_sse4.c b/aom_dsp/x86/sse_sse4.c
index 8ab6cf0..035b2b6 100644
--- a/aom_dsp/x86/sse_sse4.c
+++ b/aom_dsp/x86/sse_sse4.c
@@ -36,145 +36,6 @@
   *sum64 = _mm_add_epi64(sum1, *sum64);
 }
 
-static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
-                                  const uint8_t *b) {
-  const __m128i v_a0 = xx_loadu_128(a);
-  const __m128i v_b0 = xx_loadu_128(b);
-  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
-  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
-  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
-  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
-  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
-  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
-}
-
-static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
-                                 const uint8_t *b, int b_stride, __m128i *sum) {
-  const __m128i v_a0 = xx_loadl_32(a);
-  const __m128i v_a1 = xx_loadl_32(a + a_stride);
-  const __m128i v_b0 = xx_loadl_32(b);
-  const __m128i v_b1 = xx_loadl_32(b + b_stride);
-  const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
-  const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
-  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
-}
-static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
-                               __m128i *sum) {
-  const __m128i v_a0 = xx_loadl_64(a);
-  const __m128i v_b0 = xx_loadl_64(b);
-  const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
-  const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
-  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
-                       int b_stride, int width, int height) {
-  int y = 0;
-  int64_t sse = 0;
-  __m128i sum = _mm_setzero_si128();
-  switch (width) {
-    case 4:
-      do {
-        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 8:
-      do {
-        sse8_sse4_1(a, b, &sum);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 16:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 32:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16, b + 16);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 64:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 128:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
-        sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
-        sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
-        sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
-        sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    default:
-      if (width & 0x07) {
-        do {
-          int i = 0;
-          do {
-            sse8_sse4_1(a + i, b + i, &sum);
-            sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
-            i += 8;
-          } while (i + 4 < width);
-          sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
-          a += (a_stride << 1);
-          b += (b_stride << 1);
-          y += 2;
-        } while (y < height);
-      } else {
-        do {
-          int i = 0;
-          do {
-            sse8_sse4_1(a + i, b + i, &sum);
-            i += 8;
-          } while (i < width);
-          a += a_stride;
-          b += b_stride;
-          y += 1;
-        } while (y < height);
-      }
-      sse = summary_all_sse4(&sum);
-      break;
-  }
-
-  return sse;
-}
-
 static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
                                           int a_stride, const uint16_t *b,
                                           int b_stride) {
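Likewise, the SSE4.1 file only loses its 8-bit path (the static sse_w16 / sse8 / sse4x2 helpers plus the aom_sse_sse4_1 dispatcher); it computes the identical sum with 128-bit registers and folds the per-register partial sums through the summary_all_sse4 helper, which is kept. For reference, a hypothetical reduction of four non-negative 32-bit partial sums into a 64-bit total (a sketch, not the library's helper) could look like:

#include <emmintrin.h>
#include <stdint.h>

/* Hypothetical horizontal reduction: zero-extend the four 32-bit partial sums
 * to 64 bits, then fold the two halves together. Assumes the partial sums are
 * non-negative, which holds for accumulated squared differences. */
static int64_t hsum32x4_sketch(__m128i v) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo = _mm_add_epi64(_mm_unpacklo_epi32(v, zero),
                                   _mm_unpackhi_epi32(v, zero));
  const __m128i hi = _mm_unpackhi_epi64(lo, lo);
  int64_t out;
  _mm_storel_epi64((__m128i *)&out, _mm_add_epi64(lo, hi));
  return out;
}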
diff --git a/aom_dsp/x86/ssim_sse2_x86_64.asm b/aom_dsp/x86/ssim_sse2_x86_64.asm
deleted file mode 100644
index 0eb7ff5..0000000
--- a/aom_dsp/x86/ssim_sse2_x86_64.asm
+++ /dev/null
@@ -1,222 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-
-SECTION .text
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-globalsym(aom_ssim_parms_16x16_sse2)
-sym(aom_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-globalsym(aom_ssim_parms_8x8_sse2)
-sym(aom_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
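The two ssim_parms kernels deleted with this file only ever operated on 8-bit source and reference blocks; their job is to fill the five accumulators SSIM needs (sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr). A scalar sketch of the same accumulation, with illustrative naming, assuming the caller zeroes the outputs beforehand as the TODO above notes:

#include <stdint.h>

/* Illustrative scalar model of the removed aom_ssim_parms_8x8/16x16_sse2:
 * accumulate the sums SSIM needs over an n x n block (n = 8 or 16) of 8-bit
 * source (s) and reference (r) samples. Outputs are accumulated, not reset. */
static void ssim_parms_sketch(const uint8_t *s, int sp, const uint8_t *r,
                              int rp, int n, uint32_t *sum_s, uint32_t *sum_r,
                              uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                              uint32_t *sum_sxr) {
  for (int y = 0; y < n; ++y, s += sp, r += rp) {
    for (int x = 0; x < n; ++x) {
      *sum_s += s[x];
      *sum_r += r[x];
      *sum_sq_s += (uint32_t)s[x] * s[x];
      *sum_sq_r += (uint32_t)r[x] * r[x];
      *sum_sxr += (uint32_t)s[x] * r[x];
    }
  }
}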
diff --git a/aom_dsp/x86/subpel_variance_sse2.asm b/aom_dsp/x86/subpel_variance_sse2.asm
deleted file mode 100644
index 431f966..0000000
--- a/aom_dsp/x86/subpel_variance_sse2.asm
+++ /dev/null
@@ -1,1470 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times  8 dw  8
-bilin_filter_m_sse2: times  8 dw 16
-                     times  8 dw  0
-                     times  8 dw 14
-                     times  8 dw  2
-                     times  8 dw 12
-                     times  8 dw  4
-                     times  8 dw 10
-                     times  8 dw  6
-                     times 16 dw  8
-                     times  8 dw  6
-                     times  8 dw 10
-                     times  8 dw  4
-                     times  8 dw 12
-                     times  8 dw  2
-                     times  8 dw 14
-
-bilin_filter_m_ssse3: times  8 db 16,  0
-                      times  8 db 14,  2
-                      times  8 db 12,  4
-                      times  8 db 10,  6
-                      times 16 db  8
-                      times  8 db  6, 10
-                      times  8 db  4, 12
-                      times  8 db  2, 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
-;                               int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
-  psubw                %3, %4
-  psubw                %1, %2
-  paddw                %5, %3
-  pmaddwd              %3, %3
-  paddw                %5, %1
-  pmaddwd              %1, %1
-  paddd                %6, %3
-  paddd                %6, %1
-%endmacro
-
-%macro STORE_AND_RET 1
-%if %1 > 4
-  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
-  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
-  ; We have to sign-extend it before adding the words within the register
-  ; and outputing to a dword.
-  pcmpgtw              m5, m6           ; mask for 0 > x
-  movhlps              m3, m7
-  punpcklwd            m4, m6, m5
-  punpckhwd            m6, m5           ; sign-extend m6 word->dword
-  paddd                m7, m3
-  paddd                m6, m4
-  pshufd               m3, m7, 0x1
-  movhlps              m4, m6
-  paddd                m7, m3
-  paddd                m6, m4
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  pshufd               m4, m6, 0x1
-  movd               [r1], m7           ; store sse
-  paddd                m6, m4
-  movd               raxd, m6           ; store sum as return value
-%else ; 4xh
-  pshuflw              m4, m6, 0xe
-  pshuflw              m3, m7, 0xe
-  paddw                m6, m4
-  paddd                m7, m3
-  pcmpgtw              m5, m6           ; mask for 0 > x
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  punpcklwd            m6, m5           ; sign-extend m6 word->dword
-  movd               [r1], m7           ; store sse
-  pshuflw              m4, m6, 0xe
-  paddd                m6, m4
-  movd               raxd, m6           ; store sum as return value
-%endif
-  RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  add                srcq, src_stridemp
-%else
-  add                srcq, src_strideq
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%if cpuflag(ssse3)
-%define bilin_filter_m bilin_filter_m_ssse3
-%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-%endif
-; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
-; 11, not 13, if the registers are ordered correctly. May make a minor speed
-; difference on Win64
-
-%if ARCH_X86_64
-  %if %2 == 1 ; avg
-    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                        x_offset, y_offset, dst, dst_stride, \
-                                        sec, sec_stride, height, sse
-    %define sec_str sec_strideq
-  %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
-                                    x_offset, y_offset, dst, dst_stride, \
-                                    height, sse
-  %endif
-  %define block_height heightd
-  %define bilin_filter sseq
-%else
-  %if CONFIG_PIC=1
-    %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                          x_offset, y_offset, dst, dst_stride, \
-                                          sec, sec_stride, height, sse
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-    %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                      x_offset, y_offset, dst, dst_stride, \
-                                      height, sse
-      %define block_height heightd
-    %endif
-
-    ; reuse argument stack space
-    %define g_bilin_filterm x_offsetm
-    %define g_pw_8m y_offsetm
-
-    ;Store bilin_filter and pw_8 location in stack
-    %if GET_GOT_DEFINED == 1
-      GET_GOT eax
-      add esp, 4                ; restore esp
-    %endif
-
-    lea ecx, [GLOBAL(bilin_filter_m)]
-    mov g_bilin_filterm, ecx
-
-    lea ecx, [GLOBAL(pw_8)]
-    mov g_pw_8m, ecx
-
-    LOAD_IF_USED 0, 1         ; load eax, ecx back
-  %else
-    %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                          x_offset, y_offset, \
-                                          dst, dst_stride, sec, sec_stride, \
-                                          height, sse
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-    %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                      x_offset, y_offset, dst, dst_stride, \
-                                      height, sse
-      %define block_height heightd
-    %endif
-    %define bilin_filter bilin_filter_m
-  %endif
-%endif
-
-%if %1 == 4
-  %define movx movd
-%else
-  %define movx movh
-%endif
-
-  ASSERT               %1 <= 16         ; m6 overflows if w > 16
-  pxor                 m6, m6           ; sum
-  pxor                 m7, m7           ; sse
-  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
-  ; could perhaps use it for something more productive then
-  pxor                 m5, m5           ; dedicated zero register
-%if %1 < 16
-  sar                   block_height, 1
-%if %2 == 1 ; avg
-  shl             sec_str, 1
-%endif
-%endif
-
-  ; FIXME(rbultje) replace by jumptable?
-  test          x_offsetd, x_offsetd
-  jnz .x_nonzero
-  ; x_offset == 0
-  test          y_offsetd, y_offsetd
-  jnz .x_zero_y_nonzero
-
-  ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  mova                 m1, [dstq]
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%endif
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-
-%if %2 == 0 ; !avg
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m0, [srcq+src_strideq]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-%endif
-%else ; !avg
-  movx                 m2, [srcq+src_strideq]
-%endif
-
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-
-%if %2 == 1 ; avg
-%if %1 > 4
-  pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%if %1 > 4
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_zero_loop
-  STORE_AND_RET %1
-
-.x_zero_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_zero_y_nonhalf
-
-  ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-%endif
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m2, [srcq+src_strideq]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m2, [srcq+src_strideq*2]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq*2]
-  punpckldq            m2, m1
-%endif
-  movx                 m1, [dstq]
-%if %1 > 4
-  movlhps              m0, m2
-%else ; 4xh
-  punpckldq            m0, m2
-%endif
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpcklbw            m3, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m4, [secq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m4, [srcq+src_strideq*2]
-  movx                 m1, [dstq]
-  pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m2, m4
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_half_loop
-  STORE_AND_RET %1
-
-.x_zero_y_nonhalf:
-  ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  punpcklbw            m0, m5
-  punpcklbw            m4, m5
-  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
-  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
-  ; slightly faster because of pmullw latency. It would also cut our rodata
-  ; tables in half for this function, and save 1-2 registers on x86-64.
-  pmullw               m2, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m2, m3
-  paddw                m0, m4
-%endif
-  psraw                m2, 4
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m2, [srcq+src_strideq]
-  movx                 m4, [srcq+src_strideq*2]
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  movx                 m1, [dstq]
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m1, m2, filter_y_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, m1
-  paddw                m2, filter_rnd
-  movx                 m1, [dstq]
-  paddw                m2, m4
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonzero:
-  cmp           x_offsetd, 4
-  jne .x_nonhalf
-  ; x_offset == 0.5
-  test          y_offsetd, y_offsetd
-  jnz .x_half_y_nonzero
-
-  ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-%endif
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m4, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m0, [srcq+src_strideq]
-  movhps               m4, [srcq+src_strideq+1]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-  movx                 m2, [srcq+src_strideq+1]
-  punpckldq            m4, m2
-%endif
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m2, [srcq+src_strideq]
-  movx                 m1, [dstq]
-  pavgb                m0, m4
-  movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m2, m4
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_zero_loop
-  STORE_AND_RET %1
-
-.x_half_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_half_y_nonhalf
-
-  ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_half_loop:
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m4, m3
-  punpckhbw            m3, m1, m5
-  pavgb                m0, m4
-%if %2 == 1 ; avg
-  punpcklbw            m1, m5
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_half_loop:
-  movx                 m2, [srcq]
-  movx                 m3, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m2, [srcq+src_strideq]
-  movhps               m3, [srcq+src_strideq+1]
-%else
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m2, m1
-  movx                 m1, [srcq+src_strideq+1]
-  punpckldq            m3, m1
-%endif
-  pavgb                m2, m3
-%if %1 > 4
-  movlhps              m0, m2
-  movhlps              m4, m2
-%else ; 4xh
-  punpckldq            m0, m2
-  pshuflw              m4, m2, 0xe
-%endif
-  movx                 m1, [dstq]
-  pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
-%if %1 > 4
-  pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%if %1 > 4
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m4, [srcq+src_strideq]
-  movx                 m1, [srcq+src_strideq+1]
-  pavgb                m2, m3
-  pavgb                m4, m1
-  pavgb                m0, m2
-  pavgb                m2, m4
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_half_loop
-  STORE_AND_RET %1
-
-.x_half_y_nonhalf:
-  ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else  ;x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_other_loop:
-  movu                 m4, [srcq]
-  movu                 m2, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m4, m2
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  pmullw               m2, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, filter_rnd
-  punpcklbw            m0, m5
-  paddw                m2, m3
-  punpcklbw            m3, m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  paddw                m0, m3
-%endif
-  punpckhbw            m3, m1, m5
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-%if notcpuflag(ssse3)
-  punpcklbw            m0, m5
-%endif
-.x_half_y_other_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m4, [srcq+src_strideq]
-  movx                 m3, [srcq+src_strideq+1]
-  pavgb                m2, m1
-  pavgb                m4, m3
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  movx                 m1, [dstq]
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m1, m2, filter_y_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  paddw                m0, m1
-  pmullw               m1, m4, filter_y_b
-  paddw                m2, filter_rnd
-  paddw                m2, m1
-  movx                 m1, [dstq]
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf:
-  test          y_offsetd, y_offsetd
-  jnz .x_nonhalf_y_nonzero
-
-  ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-;y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  punpcklbw            m0, m5
-  punpcklbw            m4, m5
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m2, m3
-  paddw                m0, m4
-%endif
-  psraw                m2, 4
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m2, [srcq+src_strideq]
-  movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  movx                 m1, [dstq]
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_x_a
-  pmaddubsw            m2, filter_x_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m0, m1
-  paddw                m2, filter_rnd
-  movx                 m1, [dstq]
-  paddw                m2, m4
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_nonhalf_y_nonhalf
-
-  ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m1
-  punpcklbw            m0, m1
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m0, m1
-  paddw                m2, m3
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-  add                srcq, src_strideq
-  packuswb             m0, m2
-.x_other_y_half_loop:
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-%if cpuflag(ssse3)
-  mova                 m1, [dstq]
-  punpckhbw            m2, m4, m3
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m4, m2
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%else
-  punpckhbw            m2, m4, m5
-  punpckhbw            m1, m3, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m4, m3
-  paddw                m2, m1
-  mova                 m1, [dstq]
-  psraw                m4, 4
-  psraw                m2, 4
-  punpckhbw            m3, m1, m5
-  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
-  ; have a 1-register shortage to be able to store the backup of the bilin
-  ; filtered second line as words as cache for the next line. Packing into
-  ; a byte costs 1 pack and 2 unpacks, but saves a register.
-  packuswb             m4, m2
-  punpcklbw            m1, m5
-  pavgb                m0, m4
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  pavgb                m0, [secq]
-%endif
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  pmaddubsw            m0, filter_x_a
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m1
-%endif
-  add                srcq, src_strideq
-  psraw                m0, 4
-.x_other_y_half_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m4, [srcq+src_strideq]
-  movx                 m3, [srcq+src_strideq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m2, m1
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m1, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  paddw                m2, m1
-  movx                 m1, [dstq]
-  paddw                m4, m3
-  movx                 m3, [dstq+dst_strideq]
-%endif
-  psraw                m2, 4
-  psraw                m4, 4
-  pavgw                m0, m2
-  pavgw                m2, m4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf_y_nonhalf:
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                m11, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register. Used src_stride register. Later,
-; src_stride has to be loaded from stack when it is needed.
-%define tempq src_strideq
-  mov tempq, g_bilin_filterm
-  add           x_offsetq, tempq
-  add           y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-  add           y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-  ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m1
-  punpcklbw            m0, m1
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m0, m1
-  paddw                m2, m3
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-  packuswb             m0, m2
-.x_other_y_other_loop:
-%if cpuflag(ssse3)
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
-  punpckhbw            m2, m4, m3
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  punpckhbw            m3, m1, m5
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m4, m2
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  punpcklbw            m1, m5
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  psraw                m0, 4
-%else
-  movu                 m3, [srcq]
-  movu                 m4, [srcq+1]
-  punpckhbw            m1, m3, m5
-  punpckhbw            m2, m4, m5
-  punpcklbw            m3, m5
-  punpcklbw            m4, m5
-  pmullw               m3, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m3, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m3, m4
-  paddw                m1, m2
-  psraw                m3, 4
-  psraw                m1, 4
-  packuswb             m4, m3, m1
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  pmullw               m2, filter_y_a
-  pmullw               m1, filter_y_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, m1
-  mova                 m1, [dstq]
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  paddw                m0, m3
-  punpckhbw            m3, m1, m5
-  psraw                m0, 4
-  punpcklbw            m1, m5
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  INC_SRC_BY_SRC_STRIDE
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  pmaddubsw            m0, filter_x_a
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m1
-%endif
-  psraw                m0, 4
-%if cpuflag(ssse3)
-  packuswb             m0, m0
-%endif
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-
-  INC_SRC_BY_SRC_STRIDE
-  movx                 m4, [srcq]
-  movx                 m3, [srcq+1]
-
-%if cpuflag(ssse3)
-  punpcklbw            m2, m1
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  movx                 m3, [dstq+dst_strideq]
-  movx                 m1, [dstq]
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m2, m2
-  packuswb             m4, m4
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-  psraw                m0, 4
-  psraw                m2, 4
-  punpcklbw            m1, m5
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m1, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  paddw                m2, m1
-  paddw                m4, m3
-  psraw                m2, 4
-  psraw                m4, 4
-  pmullw               m0, filter_y_a
-  pmullw               m3, m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  pmullw               m1, m4, filter_y_b
-  paddw                m2, filter_rnd
-  paddw                m0, m3
-  movx                 m3, [dstq+dst_strideq]
-  paddw                m2, m1
-  movx                 m1, [dstq]
-  psraw                m0, 4
-  psraw                m2, 4
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-%undef movx
-  STORE_AND_RET %1
-%endmacro
-
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
-; between the ssse3 and non-ssse3 version. It may make sense to merge their
-; code in the sense that the ssse3 version would jump to the appropriate
-; location in the sse/2 version, rather than duplicating that code in the
-; binary.
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
diff --git a/aom_dsp/x86/subtract_avx2.c b/aom_dsp/x86/subtract_avx2.c
deleted file mode 100644
index 8326451..0000000
--- a/aom_dsp/x86/subtract_avx2.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
-                                   const uint8_t *pred_ptr) {
-  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
-  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
-  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
-  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
-  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
-  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
-  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
-  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
-  _mm256_store_si256((__m256i *)(diff_ptr), d_0);
-  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
-}
-
-static INLINE void subtract_block_16xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
-    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
-    __m256i s_0 = _mm256_cvtepu8_epi16(s);
-    __m256i p_0 = _mm256_cvtepu8_epi16(p);
-    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
-    _mm256_store_si256((__m256i *)(diff_ptr), d_0);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void subtract_block_32xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void subtract_block_64xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void subtract_block_128xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
-    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
-    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
-                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
-                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                             ptrdiff_t pred_stride) {
-  switch (cols) {
-    case 16:
-      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
-                               pred_ptr, pred_stride);
-      break;
-    case 32:
-      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
-                               pred_ptr, pred_stride);
-      break;
-    case 64:
-      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
-                               pred_ptr, pred_stride);
-      break;
-    case 128:
-      subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                src_stride, pred_ptr, pred_stride);
-      break;
-    default:
-      aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
-                           src_stride, pred_ptr, pred_stride);
-      break;
-  }
-}
diff --git a/aom_dsp/x86/subtract_sse2.asm b/aom_dsp/x86/subtract_sse2.asm
deleted file mode 100644
index f4e9406..0000000
--- a/aom_dsp/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; void aom_subtract_block(int rows, int cols,
-;                         int16_t *diff, ptrdiff_t diff_stride,
-;                         const uint8_t *src, ptrdiff_t src_stride,
-;                         const uint8_t *pred, ptrdiff_t pred_stride)
-
-INIT_XMM sse2
-cglobal subtract_block, 7, 7, 8, \
-                        rows, cols, diff, diff_stride, src, src_stride, \
-                        pred, pred_stride
-%define pred_str colsq
-  pxor                  m7, m7         ; dedicated zero register
-  cmp                colsd, 4
-  je .case_4
-  cmp                colsd, 8
-  je .case_8
-  cmp                colsd, 16
-  je .case_16
-  cmp                colsd, 32
-  je .case_32
-  cmp                colsd, 64
-  je .case_64
-
-%macro loop16 6
-  mova                  m0, [srcq+%1]
-  mova                  m4, [srcq+%2]
-  mova                  m1, [predq+%3]
-  mova                  m5, [predq+%4]
-  punpckhbw             m2, m0, m7
-  punpckhbw             m3, m1, m7
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  psubw                 m2, m3
-  psubw                 m0, m1
-  punpckhbw             m1, m4, m7
-  punpckhbw             m3, m5, m7
-  punpcklbw             m4, m7
-  punpcklbw             m5, m7
-  psubw                 m1, m3
-  psubw                 m4, m5
-  mova [diffq+mmsize*0+%5], m0
-  mova [diffq+mmsize*1+%5], m2
-  mova [diffq+mmsize*0+%6], m4
-  mova [diffq+mmsize*1+%6], m1
-%endmacro
-
-  mov             pred_str, pred_stridemp
-.loop_128:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
-  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
-  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  sub                rowsd, 1
-  jnz .loop_128
-  RET
-
-.case_64:
-  mov             pred_str, pred_stridemp
-.loop_64:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_64
-  RET
-
-.case_32:
-  mov             pred_str, pred_stridemp
-.loop_32:
-  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_32
-  RET
-
-.case_16:
-  mov             pred_str, pred_stridemp
-.loop_16:
-  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                predq, [predq+pred_str*2]
-  lea                 srcq, [srcq+src_strideq*2]
-  sub                rowsd, 2
-  jg .loop_16
-  RET
-
-%macro loop_h 0
-  movh                  m0, [srcq]
-  movh                  m2, [srcq+src_strideq]
-  movh                  m1, [predq]
-  movh                  m3, [predq+pred_str]
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  punpcklbw             m2, m7
-  punpcklbw             m3, m7
-  psubw                 m0, m1
-  psubw                 m2, m3
-  mova             [diffq], m0
-  mova [diffq+diff_strideq*2], m2
-%endmacro
-
-.case_8:
-  mov             pred_str, pred_stridemp
-.loop_8:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_8
-  RET
-
-INIT_MMX
-.case_4:
-  mov             pred_str, pred_stridemp
-.loop_4:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_4
-  emms
-  RET
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 39d301d..b5807af 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -17,318 +17,6 @@
 #include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
 #include "aom_dsp/x86/synonyms.h"
 
-static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
-  return _mm_add_epi16(_mm256_castsi256_si128(val),
-                       _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
-  return _mm_add_epi32(_mm256_castsi256_si128(val),
-                       _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
-                                        __m256i *const sse,
-                                        __m256i *const sum) {
-  const __m256i adj_sub = _mm256_set1_epi16((short)0xff01);  // (1,-1)
-
-  // unpack into pairs of source and reference values
-  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
-  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
-
-  // subtract adjacent elements using src*1 + ref*-1
-  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
-  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
-  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
-  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
-
-  // add to the running totals
-  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
-  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
-}
-
-static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
-                                                     unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
-
-  // unpack sse and sum registers and add
-  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
-  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
-  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
-
-  // perform the final summation and extract the results
-  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
-  *((int *)sse) = _mm_cvtsi128_si32(res);
-  return _mm_extract_epi32(res, 1);
-}
-
-// handle pixels (<= 512)
-static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
-                                          unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
-  const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
-  const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
-  return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
-}
-
-// handle 1024 pixels (32x32, 16x64, 64x16)
-static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
-                                           unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
-  const __m128i vsum_64 =
-      _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
-                    _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
-  return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
-}
-
-static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
-  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
-  const __m256i sum_hi =
-      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
-  return _mm256_add_epi32(sum_lo, sum_hi);
-}
-
-// handle 2048 pixels (32x64, 64x32)
-static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
-                                           unsigned int *const sse) {
-  vsum = sum_to_32bit_avx2(vsum);
-  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
-  return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
-}
-
-static INLINE void variance16_kernel_avx2(
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    const int ref_stride, __m256i *const sse, __m256i *const sum) {
-  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
-  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
-  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
-  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
-  variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance32_kernel_avx2(const uint8_t *const src,
-                                          const uint8_t *const ref,
-                                          __m256i *const sse,
-                                          __m256i *const sum) {
-  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
-  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
-  variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i += 2) {
-    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-  }
-}
-
-static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src, ref, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
-    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
-                                    const uint8_t *ref, const int ref_stride,
-                                    const int h, __m256i *const vsse,
-                                    __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
-    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
-    variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
-    variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
-  unsigned int aom_variance##bw##x##bh##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m256i vsse = _mm256_setzero_si256();                                    \
-    __m256i vsum;                                                             \
-    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
-    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
-
-AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
-
-#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
-  unsigned int aom_variance##bw##x##bh##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m256i vsse = _mm256_setzero_si256();                                    \
-    __m256i vsum = _mm256_setzero_si256();                                    \
-    for (int i = 0; i < (bh / uh); i++) {                                     \
-      __m256i vsum16;                                                         \
-      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
-                          &vsum16);                                           \
-      vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
-      src += uh * src_stride;                                                 \
-      ref += uh * ref_stride;                                                 \
-    }                                                                         \
-    const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
-    const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
-  }
-
-AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
-AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
-AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
-AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
-
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse);
-unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse);
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sseptr);
-
-#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2)                        \
-  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_avx2(                  \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2);                                                           \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
-  }
-
-AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
-AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
-AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5);
-AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4);
-AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3);
-AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
-
-#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
-      const uint8_t *sec) {                                               \
-    /*Avoid overflow in helper by capping height.*/                       \
-    const int hf = AOMMIN(h, 64);                                         \
-    unsigned int sse = 0;                                                 \
-    int se = 0;                                                           \
-    for (int i = 0; i < (w / wf); ++i) {                                  \
-      const uint8_t *src_ptr = src;                                       \
-      const uint8_t *dst_ptr = dst;                                       \
-      const uint8_t *sec_ptr = sec;                                       \
-      for (int j = 0; j < (h / hf); ++j) {                                \
-        unsigned int sse2;                                                \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2(          \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
-            sec_ptr, w, hf, &sse2);                                       \
-        dst_ptr += hf * dst_stride;                                       \
-        src_ptr += hf * src_stride;                                       \
-        sec_ptr += hf * w;                                                \
-        se += se2;                                                        \
-        sse += sse2;                                                      \
-      }                                                                   \
-      src += wf;                                                          \
-      dst += wf;                                                          \
-      sec += wf;                                                          \
-    }                                                                     \
-    *sse_ptr = sse;                                                       \
-    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
-  }
-
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
-
 static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
   const __m256i d =
       _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
@@ -523,120 +211,3 @@
     } while (i < height);
   }
 }
-
-uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
-  uint64_t sum = 0;
-  __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
-  __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
-  __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16;
-  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
-  __m256i sub_result;
-  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
-  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
-  for (int i = 0; i < h; i += 4) {
-    dst0_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
-    dst1_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
-    dst2_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 2) * dstride]));
-    dst3_4x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 3) * dstride]));
-    dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
-                                  _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
-    dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
-
-    src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
-    src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
-    src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
-    src3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
-    src0_8x16 =
-        _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16));
-    src1_8x16 =
-        _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
-    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
-
-    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
-
-    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
-    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
-
-    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);  // 32bit store
-    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);  // 32bit store
-
-    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
-    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
-    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
-    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
-
-    square_result = _mm256_add_epi64(
-        square_result,
-        _mm256_add_epi64(
-            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
-            res3_4x64));
-  }
-  const __m128i sum_2x64 =
-      _mm_add_epi64(_mm256_castsi256_si128(square_result),
-                    _mm256_extracti128_si256(square_result, 1));
-  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
-  uint64_t sum = 0;
-  __m128i dst0_8x8, dst1_8x8, dst3_16x8;
-  __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
-  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
-  __m256i sub_result;
-  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
-  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
-
-  for (int i = 0; i < h; i += 2) {
-    dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
-    dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
-    dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8);
-    dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
-
-    src0_8x16 =
-        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
-    src1_8x16 = _mm256_castsi128_si256(
-        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
-    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
-
-    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
-
-    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
-    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
-
-    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
-    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
-
-    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
-    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
-    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
-    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
-
-    square_result = _mm256_add_epi64(
-        square_result,
-        _mm256_add_epi64(
-            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
-            res3_4x64));
-  }
-
-  const __m128i sum_2x64 =
-      _mm_add_epi64(_mm256_castsi256_si128(square_result),
-                    _mm256_extracti128_si256(square_result, 1));
-  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int w, int h) {
-  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
-         "w=8/4 and h=8/4 must satisfy");
-  switch (w) {
-    case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
-    case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
-    default: assert(0 && "unsupported width"); return -1;
-  }
-}
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index d488cdc..f2a7c41 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -43,612 +43,6 @@
   return _mm_cvtsi128_si32(vsum);
 }
 
-static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
-  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
-}
-
-static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
-  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
-  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
-}
-
-// Accumulate 4 32bit numbers in val to 1 32bit number
-static INLINE unsigned int add32x4_sse2(__m128i val) {
-  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
-  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
-  return _mm_cvtsi128_si32(val);
-}
-
-// Accumulate 8 16bit in sum to 4 32bit number
-static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
-  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
-  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
-  return _mm_add_epi32(sum_lo, sum_hi);
-}
-
-static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
-                                        __m128i *const sse,
-                                        __m128i *const sum) {
-  const __m128i diff = _mm_sub_epi16(src, ref);
-  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
-  *sum = _mm_add_epi16(*sum, diff);
-}
-
-// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
-// Slightly faster than variance_final_256_pel_sse2()
-// diff sum of 128 pixels can still fit in 16bit integer
-static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-}
-
-// Can handle 256 pixels' diff sum (such as 16x16)
-static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
-}
-
-// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
-static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_unpacklo_epi16(vsum, vsum);
-  vsum = _mm_srai_epi32(vsum, 16);
-  *sum = add32x4_sse2(vsum);
-}
-
-// Can handle 1024 pixels' diff sum (such as 32x32)
-static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
-                                                unsigned int *const sse,
-                                                int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = sum_to_32bit_sse2(vsum);
-  *sum = add32x4_sse2(vsum);
-}
-
-static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
-                                  const uint8_t *ref, const int ref_stride,
-                                  const int h, __m128i *const sse,
-                                  __m128i *const sum) {
-  assert(h <= 256);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; i += 2) {
-    const __m128i s = load4x2_sse2(src, src_stride);
-    const __m128i r = load4x2_sse2(ref, ref_stride);
-
-    variance_kernel_sse2(s, r, sse, sum);
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-  }
-}
-
-static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
-                                  const uint8_t *ref, const int ref_stride,
-                                  const int h, __m128i *const sse,
-                                  __m128i *const sum) {
-  assert(h <= 128);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-  *sse = _mm_setzero_si128();
-  for (int i = 0; i < h; i++) {
-    const __m128i s = load8_8to16_sse2(src);
-    const __m128i r = load8_8to16_sse2(ref);
-
-    variance_kernel_sse2(s, r, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance16_kernel_sse2(const uint8_t *const src,
-                                          const uint8_t *const ref,
-                                          __m128i *const sse,
-                                          __m128i *const sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i s = _mm_loadu_si128((const __m128i *)src);
-  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
-  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-
-  variance_kernel_sse2(src0, ref0, sse, sum);
-  variance_kernel_sse2(src1, ref1, sse, sum);
-}
-
-static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 64);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src, ref, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 32);  // May overflow for larger height.
-  // Don't initialize sse here since it's an accumulation.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
-    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 16);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
-    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
-    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
-    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
-                                    const uint8_t *ref, const int ref_stride,
-                                    const int h, __m128i *const sse,
-                                    __m128i *const sum) {
-  assert(h <= 8);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      const int offset0 = j << 5;
-      const int offset1 = offset0 + 16;
-      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
-      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
-    }
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *ref_ptr, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  __m128i vsse, vsum;
-  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
-  variance_final_128_pel_sse2(vsse, vsum, sse, sum);
-}
-
-#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
-  unsigned int aom_variance##bw##x##bh##_sse2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m128i vsse = _mm_setzero_si128();                                       \
-    __m128i vsum;                                                             \
-    int sum = 0;                                                              \
-    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
-    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum);            \
-    assert(sum <= 255 * bw * bh);                                             \
-    assert(sum >= -255 * bw * bh);                                            \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
-
-AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
-
-AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
-
-#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
-  unsigned int aom_variance##bw##x##bh##_sse2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m128i vsse = _mm_setzero_si128();                                       \
-    __m128i vsum = _mm_setzero_si128();                                       \
-    for (int i = 0; i < (bh / uh); ++i) {                                     \
-      __m128i vsum16;                                                         \
-      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse,        \
-                          &vsum16);                                           \
-      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));                  \
-      src += (src_stride * uh);                                               \
-      ref += (ref_stride * uh);                                               \
-    }                                                                         \
-    *sse = add32x4_sse2(vsse);                                                \
-    int sum = add32x4_sse2(vsum);                                             \
-    assert(sum <= 255 * bw * bh);                                             \
-    assert(sum >= -255 * bw * bh);                                            \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
-
-AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
-AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
-AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
-AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
-
-AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
-AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
-
-unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse) {
-  aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-// The 2 unused parameters are place holders for PIC enabled build.
-// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt)                                                           \
-  int aom_sub_pixel_variance##w##xh_##opt(                                     \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
-      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
-      void *unused0, void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2, NULL, NULL);                                               \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
-  }
-
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt)                                                        \
-  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
-      void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
-      const uint8_t *sec) {                                                  \
-    /*Avoid overflow in helper by capping height.*/                          \
-    const int hf = AOMMIN(h, 64);                                            \
-    unsigned int sse = 0;                                                    \
-    int se = 0;                                                              \
-    for (int i = 0; i < (w / wf); ++i) {                                     \
-      const uint8_t *src_ptr = src;                                          \
-      const uint8_t *dst_ptr = dst;                                          \
-      const uint8_t *sec_ptr = sec;                                          \
-      for (int j = 0; j < (h / hf); ++j) {                                   \
-        unsigned int sse2;                                                   \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
-            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
-        dst_ptr += hf * dst_stride;                                          \
-        src_ptr += hf * src_stride;                                          \
-        sec_ptr += hf * w;                                                   \
-        se += se2;                                                           \
-        sse += sse2;                                                         \
-      }                                                                      \
-      src += wf;                                                             \
-      dst += wf;                                                             \
-      sec += wf;                                                             \
-    }                                                                        \
-    *sse_ptr = sse;                                                          \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
-  }
-
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
-                             int mi_row, int mi_col, const MV *const mv,
-                             uint8_t *comp_pred, int width, int height,
-                             int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride,
-                             int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi, xd->tree_type);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilter filters = EIGHTTAP_REGULAR;
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
-  // 2-tap yet.
-  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    if (width >= 16) {
-      int i;
-      assert(!(width & 15));
-      /*Read 16 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 16) {
-          xx_storeu_128(comp_pred, xx_loadu_128(ref));
-          comp_pred += 16;
-          ref += 16;
-        }
-        ref += ref_stride - width;
-      }
-    } else if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      assert(!(height & 1));
-      /*Read 8 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
-        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
-        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
-        comp_pred += 16;
-        ref += 2 * ref_stride;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      assert(!(height & 3));
-      /*Read 4 pixels four rows at a time.*/
-      for (i = 0; i < height; i++) {
-        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
-        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
-        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
-        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
-        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
-                                               _mm_unpacklo_epi32(row2, row3));
-        xx_storeu_128(comp_pred, reg);
-        comp_pred += 16;
-        ref += 4 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
-                        width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
-                       width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
-    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
-                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
-                                    : temp;
-    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
-                        kernel_x, 16, NULL, -1, width, intermediate_height);
-    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
-                       kernel_y, 16, width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-  for (i = 0; i < n; i++) {
-    __m128i s0 = xx_loadu_128(comp_pred);
-    __m128i p0 = xx_loadu_128(pred);
-    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
-                     mask_stride, invert_mask);
-}
-
 static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
                                                       const __m128i s1,
                                                       const __m128i a) {
@@ -756,98 +150,3 @@
     } while (i < height);
   }
 }
-
-uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
-  uint64_t sum = 0;
-  __m128i dst0_8x8, dst1_8x8, dst_16x8;
-  __m128i src0_16x4, src1_16x4, src_16x8;
-  __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
-  __m128i sub_result_16x8;
-  const __m128i zeros = _mm_setzero_si128();
-  __m128i square_result = _mm_setzero_si128();
-  for (int i = 0; i < h; i += 2) {
-    dst0_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 0) * dstride]));
-    dst1_8x8 = _mm_cvtsi32_si128(*(uint32_t const *)(&dst[(i + 1) * dstride]));
-    dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
-
-    src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
-    src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
-    src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4);
-
-    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
-
-    res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
-    res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
-
-    res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
-    res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
-
-    res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
-    res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
-    res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
-    res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
-
-    square_result = _mm_add_epi64(
-        square_result,
-        _mm_add_epi64(
-            _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
-            res3_64x4));
-  }
-  const __m128i sum_1x64 =
-      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
-  uint64_t sum = 0;
-  __m128i dst_8x8, dst_16x8;
-  __m128i src_16x8;
-  __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
-  __m128i sub_result_16x8;
-  const __m128i zeros = _mm_setzero_si128();
-  __m128i square_result = _mm_setzero_si128();
-
-  for (int i = 0; i < h; i++) {
-    dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
-    dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros);
-
-    src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
-
-    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
-
-    res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
-    res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
-
-    res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
-    res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
-
-    res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
-    res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
-    res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
-    res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
-
-    square_result = _mm_add_epi64(
-        square_result,
-        _mm_add_epi64(
-            _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
-            res3_64x4));
-  }
-  const __m128i sum_1x64 =
-      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int w, int h) {
-  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
-         "w=8/4 and h=8/4 must satisfy");
-  switch (w) {
-    case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
-    case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
-    default: assert(0 && "unsupported width"); return -1;
-  }
-}
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index dc3668f..5ead645 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -46,16 +46,15 @@
 
 static int realloc_frame_buffer_aligned(
     YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y,
-    int use_highbitdepth, int border, int byte_alignment,
-    aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb,
-    void *cb_priv, const int y_stride, const uint64_t yplane_size,
-    const uint64_t uvplane_size, const int aligned_width,
-    const int aligned_height, const int uv_width, const int uv_height,
-    const int uv_stride, const int uv_border_w, const int uv_border_h) {
+    int border, int byte_alignment, aom_codec_frame_buffer_t *fb,
+    aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, const int y_stride,
+    const uint64_t yplane_size, const uint64_t uvplane_size,
+    const int aligned_width, const int aligned_height, const int uv_width,
+    const int uv_height, const int uv_stride, const int uv_border_w,
+    const int uv_border_h) {
   if (ybf) {
     const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
-    const uint64_t frame_size =
-        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
+    const uint64_t frame_size = 2 * (yplane_size + 2 * uvplane_size);
 
     uint8_t *buf = NULL;
 
@@ -63,7 +62,7 @@
     // The size of ybf->buffer_alloc.
     uint64_t alloc_size = frame_size;
     // The size of ybf->y_buffer_8bit.
-    if (use_highbitdepth) alloc_size += yplane_size;
+    alloc_size += yplane_size;
     // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
     // pool. Bound the total amount of allocated memory as if these REF_FRAMES
     // frame buffers were allocated in a single allocation.
@@ -133,14 +132,8 @@
     ybf->subsampling_x = ss_x;
     ybf->subsampling_y = ss_y;
 
-    buf = ybf->buffer_alloc;
-    if (use_highbitdepth) {
-      // Store uint16 addresses when using 16bit framebuffers
-      buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
-      ybf->flags = YV12_FLAG_HIGHBITDEPTH;
-    } else {
-      ybf->flags = 0;
-    }
+    // Store uint16 addresses when using 16bit framebuffers
+    buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
 
     ybf->y_buffer = (uint8_t *)aom_align_addr(
         buf + (border * y_stride) + border, aom_byte_align);
@@ -154,16 +147,9 @@
 
     ybf->use_external_reference_buffers = 0;
 
-    if (use_highbitdepth) {
-      if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
-      ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
-      if (!ybf->y_buffer_8bit) return AOM_CODEC_MEM_ERROR;
-    } else {
-      if (ybf->y_buffer_8bit) {
-        aom_free(ybf->y_buffer_8bit);
-        ybf->y_buffer_8bit = NULL;
-      }
-    }
+    if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
+    ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
+    if (!ybf->y_buffer_8bit) return AOM_CODEC_MEM_ERROR;
     // y_buffer_8bit may have been allocated above, but it has not been filled
     // in yet. So, mark it as invalid.
     ybf->buf_8bit_valid = 0;
@@ -197,8 +183,7 @@
 }
 
 int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                             int ss_x, int ss_y, int use_highbitdepth,
-                             int border, int byte_alignment,
+                             int ss_x, int ss_y, int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
 #if CONFIG_SIZE_LIMIT
@@ -223,22 +208,19 @@
         &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height);
     if (error) return error;
     return realloc_frame_buffer_aligned(
-        ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
-        byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
-        aligned_width, aligned_height, uv_width, uv_height, uv_stride,
-        uv_border_w, uv_border_h);
+        ybf, width, height, ss_x, ss_y, border, byte_alignment, fb, cb, cb_priv,
+        y_stride, yplane_size, uvplane_size, aligned_width, aligned_height,
+        uv_width, uv_height, uv_stride, uv_border_w, uv_border_h);
   }
   return AOM_CODEC_MEM_ERROR;
 }
 
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                           int ss_x, int ss_y, int use_highbitdepth, int border,
-                           int byte_alignment) {
+                           int ss_x, int ss_y, int border, int byte_alignment) {
   if (ybf) {
     aom_free_frame_buffer(ybf);
-    return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
-                                    use_highbitdepth, border, byte_alignment,
-                                    NULL, NULL, NULL);
+    return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border,
+                                    byte_alignment, NULL, NULL, NULL);
   }
   return AOM_CODEC_MEM_ERROR;
 }
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c
index 53937fc..7791a0d 100644
--- a/aom_scale/generic/yv12extend.c
+++ b/aom_scale/generic/yv12extend.c
@@ -20,47 +20,6 @@
 #include "aom_ports/mem.h"
 #include "aom_scale/yv12config.h"
 
-static void extend_plane(uint8_t *const src, int src_stride, int width,
-                         int height, int extend_top, int extend_left,
-                         int extend_bottom, int extend_right) {
-  assert(src != NULL);
-  int i;
-  const int linesize = extend_left + extend_right + width;
-
-  /* copy the left and right most columns out */
-  uint8_t *src_ptr1 = src;
-  uint8_t *src_ptr2 = src + width - 1;
-  uint8_t *dst_ptr1 = src - extend_left;
-  uint8_t *dst_ptr2 = src + width;
-
-  for (i = 0; i < height; ++i) {
-    memset(dst_ptr1, src_ptr1[0], extend_left);
-    memset(dst_ptr2, src_ptr2[0], extend_right);
-    src_ptr1 += src_stride;
-    src_ptr2 += src_stride;
-    dst_ptr1 += src_stride;
-    dst_ptr2 += src_stride;
-  }
-
-  /* Now copy the top and bottom lines into each line of the respective
-   * borders
-   */
-  src_ptr1 = src - extend_left;
-  src_ptr2 = src + src_stride * (height - 1) - extend_left;
-  dst_ptr1 = src + src_stride * -extend_top - extend_left;
-  dst_ptr2 = src + src_stride * height - extend_left;
-
-  for (i = 0; i < extend_top; ++i) {
-    memcpy(dst_ptr1, src_ptr1, linesize);
-    dst_ptr1 += src_stride;
-  }
-
-  for (i = 0; i < extend_bottom; ++i) {
-    memcpy(dst_ptr2, src_ptr2, linesize);
-    dst_ptr2 += src_stride;
-  }
-}
-
 static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
                               int height, int extend_top, int extend_left,
                               int extend_bottom, int extend_right) {
@@ -110,27 +69,14 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int plane = 0; plane < num_planes; ++plane) {
-      const int is_uv = plane > 0;
-      const int plane_border = ybf->border >> is_uv;
-      extend_plane_high(
-          ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
-          ybf->crop_heights[is_uv], plane_border, plane_border,
-          plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
-          plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
-    }
-    return;
-  }
-
   for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int plane_border = ybf->border >> is_uv;
-    extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
-                 ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
-                 plane_border, plane_border,
-                 plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
-                 plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
+    extend_plane_high(
+        ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
+        ybf->crop_heights[is_uv], plane_border, plane_border,
+        plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
+        plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
   }
 }
 
@@ -144,29 +90,15 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int plane = 0; plane < num_planes; ++plane) {
-      const int is_uv = plane > 0;
-      const int top = ext_size >> (is_uv ? ss_y : 0);
-      const int left = ext_size >> (is_uv ? ss_x : 0);
-      const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
-      const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
-      extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
-                        ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top,
-                        left, bottom, right);
-    }
-    return;
-  }
-
   for (int plane = 0; plane < num_planes; ++plane) {
     const int is_uv = plane > 0;
     const int top = ext_size >> (is_uv ? ss_y : 0);
     const int left = ext_size >> (is_uv ? ss_x : 0);
     const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
     const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
-    extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
-                 ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left,
-                 bottom, right);
+    extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
+                      ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top,
+                      left, bottom, right);
   }
 }
 
@@ -189,18 +121,10 @@
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
-                      ybf->y_crop_height, ext_size, ext_size,
-                      ext_size + ybf->y_height - ybf->y_crop_height,
-                      ext_size + ybf->y_width - ybf->y_crop_width);
-    return;
-  }
-
-  extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
-               ybf->y_crop_height, ext_size, ext_size,
-               ext_size + ybf->y_height - ybf->y_crop_height,
-               ext_size + ybf->y_width - ybf->y_crop_width);
+  extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+                    ybf->y_crop_height, ext_size, ext_size,
+                    ext_size + ybf->y_height - ybf->y_crop_height,
+                    ext_size + ybf->y_width - ybf->y_crop_width);
 }
 
 static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
@@ -217,32 +141,13 @@
   assert(src_bc->y_width == dst_bc->y_width);
   assert(src_bc->y_height == dst_bc->y_height);
 
-  assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
-         (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
-
-  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (int plane = 0; plane < num_planes; ++plane) {
-      const uint8_t *plane_src = src_bc->buffers[plane];
-      uint8_t *plane_dst = dst_bc->buffers[plane];
-      const int is_uv = plane > 0;
-
-      for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
-        memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]);
-        plane_src += src_bc->strides[is_uv];
-        plane_dst += dst_bc->strides[is_uv];
-      }
-    }
-    aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
-    return;
-  }
-
   for (int plane = 0; plane < num_planes; ++plane) {
     const uint8_t *plane_src = src_bc->buffers[plane];
     uint8_t *plane_dst = dst_bc->buffers[plane];
     const int is_uv = plane > 0;
 
     for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
-      memcpy(plane_dst, plane_src, src_bc->widths[is_uv]);
+      memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]);
       plane_src += src_bc->strides[is_uv];
       plane_dst += dst_bc->strides[is_uv];
     }
@@ -256,21 +161,12 @@
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
-  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-    for (row = 0; row < src_ybc->y_height; ++row) {
-      memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
-      src16 += src_ybc->y_stride;
-      dst16 += dst_ybc->y_stride;
-    }
-    return;
-  }
-
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
   for (row = 0; row < src_ybc->y_height; ++row) {
-    memcpy(dst, src, src_ybc->y_width);
-    src += src_ybc->y_stride;
-    dst += dst_ybc->y_stride;
+    memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+    src16 += src_ybc->y_stride;
+    dst16 += dst_ybc->y_stride;
   }
 }
 
@@ -280,21 +176,12 @@
   const uint8_t *src = src_bc->u_buffer;
   uint8_t *dst = dst_bc->u_buffer;
 
-  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-    for (row = 0; row < src_bc->uv_height; ++row) {
-      memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
-      src16 += src_bc->uv_stride;
-      dst16 += dst_bc->uv_stride;
-    }
-    return;
-  }
-
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
   for (row = 0; row < src_bc->uv_height; ++row) {
-    memcpy(dst, src, src_bc->uv_width);
-    src += src_bc->uv_stride;
-    dst += dst_bc->uv_stride;
+    memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
+    src16 += src_bc->uv_stride;
+    dst16 += dst_bc->uv_stride;
   }
 }
 
@@ -304,21 +191,12 @@
   const uint8_t *src = src_bc->v_buffer;
   uint8_t *dst = dst_bc->v_buffer;
 
-  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-    for (row = 0; row < src_bc->uv_height; ++row) {
-      memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
-      src16 += src_bc->uv_stride;
-      dst16 += dst_bc->uv_stride;
-    }
-    return;
-  }
-
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
   for (row = 0; row < src_bc->uv_height; ++row) {
-    memcpy(dst, src, src_bc->uv_width);
-    src += src_bc->uv_stride;
-    dst += dst_bc->uv_stride;
+    memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
+    src16 += src_bc->uv_stride;
+    dst16 += dst_bc->uv_stride;
   }
 }
 
@@ -330,27 +208,15 @@
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
-  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *src16 =
-        CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1);
-    uint16_t *dst16 =
-        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2);
-
-    for (row = vstart1; row < vend1; ++row) {
-      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
-      src16 += src_ybc->y_stride;
-      dst16 += dst_ybc->y_stride;
-    }
-    return;
-  }
-
-  src = (src + vstart1 * src_ybc->y_stride + hstart1);
-  dst = (dst + vstart2 * dst_ybc->y_stride + hstart2);
+  const uint16_t *src16 =
+      CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1);
+  uint16_t *dst16 =
+      CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2);
 
   for (row = vstart1; row < vend1; ++row) {
-    memcpy(dst, src, (hend1 - hstart1));
-    src += src_ybc->y_stride;
-    dst += dst_ybc->y_stride;
+    memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
+    src16 += src_ybc->y_stride;
+    dst16 += dst_ybc->y_stride;
   }
 }
 
@@ -369,26 +235,14 @@
   const uint8_t *src = src_bc->u_buffer;
   uint8_t *dst = dst_bc->u_buffer;
 
-  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *src16 =
-        CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
-    uint16_t *dst16 =
-        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
-    for (row = vstart1; row < vend1; ++row) {
-      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
-      src16 += src_bc->uv_stride;
-      dst16 += dst_bc->uv_stride;
-    }
-    return;
-  }
-
-  src = (src + vstart1 * src_bc->uv_stride + hstart1);
-  dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
-
+  const uint16_t *src16 =
+      CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
+  uint16_t *dst16 =
+      CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
   for (row = vstart1; row < vend1; ++row) {
-    memcpy(dst, src, (hend1 - hstart1));
-    src += src_bc->uv_stride;
-    dst += dst_bc->uv_stride;
+    memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
+    src16 += src_bc->uv_stride;
+    dst16 += dst_bc->uv_stride;
   }
 }
 
@@ -407,26 +261,14 @@
   const uint8_t *src = src_bc->v_buffer;
   uint8_t *dst = dst_bc->v_buffer;
 
-  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *src16 =
-        CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
-    uint16_t *dst16 =
-        CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
-    for (row = vstart1; row < vend1; ++row) {
-      memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
-      src16 += src_bc->uv_stride;
-      dst16 += dst_bc->uv_stride;
-    }
-    return;
-  }
-
-  src = (src + vstart1 * src_bc->uv_stride + hstart1);
-  dst = (dst + vstart2 * dst_bc->uv_stride + hstart2);
-
+  const uint16_t *src16 =
+      CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1);
+  uint16_t *dst16 =
+      CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2);
   for (row = vstart1; row < vend1; ++row) {
-    memcpy(dst, src, (hend1 - hstart1));
-    src += src_bc->uv_stride;
-    dst += dst_bc->uv_stride;
+    memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t));
+    src16 += src_bc->uv_stride;
+    dst16 += dst_bc->uv_stride;
   }
 }
 
@@ -445,8 +287,7 @@
     memset(&new_buf, 0, sizeof(new_buf));
     const int error = aom_alloc_frame_buffer(
         &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x,
-        ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border,
-        byte_alignment);
+        ybf->subsampling_y, new_border, byte_alignment);
     if (error) return error;
     // Copy image buffer
     aom_yv12_copy_frame(ybf, &new_buf, num_planes);
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index 6172c77..f8231f5 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -119,11 +119,8 @@
 
 /*!\cond */
 
-#define YV12_FLAG_HIGHBITDEPTH 8
-
 int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                           int ss_x, int ss_y, int use_highbitdepth, int border,
-                           int byte_alignment);
+                           int ss_x, int ss_y, int border, int byte_alignment);
 
 // Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
 // be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
@@ -133,8 +130,7 @@
 // internally to decode the current frame. Returns 0 on success. Returns < 0
 // on failure.
 int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
-                             int ss_x, int ss_y, int use_highbitdepth,
-                             int border, int byte_alignment,
+                             int ss_x, int ss_y, int border, int byte_alignment,
                              aom_codec_frame_buffer_t *fb,
                              aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
 
diff --git a/aom_util/debug_util.c b/aom_util/debug_util.c
index 462da45..22e2b13 100644
--- a/aom_util/debug_util.c
+++ b/aom_util/debug_util.c
@@ -136,18 +136,17 @@
 
 void mismatch_record_block_pre(const uint8_t *src, int src_stride,
                                int frame_offset, int plane, int pixel_c,
-                               int pixel_r, int blk_w, int blk_h, int highbd) {
+                               int pixel_r, int blk_w, int blk_h) {
   if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
     printf("frame_buf undersized\n");
     assert(0);
   }
 
-  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
   for (int r = 0; r < blk_h; ++r) {
     for (int c = 0; c < blk_w; ++c) {
-      frame_pre[frame_buf_idx_w][plane]
-               [(r + pixel_r) * frame_stride + c + pixel_c] =
-                   src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+      frame_pre[frame_buf_idx_w][plane][(r + pixel_r) * frame_stride + c +
+                                        pixel_c] = src16[r * src_stride + c];
     }
   }
 #if 0
@@ -169,18 +168,17 @@
 }
 void mismatch_record_block_tx(const uint8_t *src, int src_stride,
                               int frame_offset, int plane, int pixel_c,
-                              int pixel_r, int blk_w, int blk_h, int highbd) {
+                              int pixel_r, int blk_w, int blk_h) {
   if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
     printf("frame_buf undersized\n");
     assert(0);
   }
 
-  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
   for (int r = 0; r < blk_h; ++r) {
     for (int c = 0; c < blk_w; ++c) {
-      frame_tx[frame_buf_idx_w][plane]
-              [(r + pixel_r) * frame_stride + c + pixel_c] =
-                  src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+      frame_tx[frame_buf_idx_w][plane][(r + pixel_r) * frame_stride + c +
+                                       pixel_c] = src16[r * src_stride + c];
     }
   }
 #if 0
@@ -201,20 +199,19 @@
 }
 void mismatch_check_block_pre(const uint8_t *src, int src_stride,
                               int frame_offset, int plane, int pixel_c,
-                              int pixel_r, int blk_w, int blk_h, int highbd) {
+                              int pixel_r, int blk_w, int blk_h) {
   if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
     printf("frame_buf undersized\n");
     assert(0);
   }
 
-  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
   int mismatch = 0;
   for (int r = 0; r < blk_h; ++r) {
     for (int c = 0; c < blk_w; ++c) {
       if (frame_pre[frame_buf_idx_r][plane]
                    [(r + pixel_r) * frame_stride + c + pixel_c] !=
-          (uint16_t)(src16 ? src16[r * src_stride + c]
-                           : src[r * src_stride + c])) {
+          (uint16_t)(src16[r * src_stride + c])) {
         mismatch = 1;
       }
     }
@@ -237,8 +234,7 @@
     printf("dec\n");
     for (int rr = 0; rr < blk_h; ++rr) {
       for (int cc = 0; cc < blk_w; ++cc) {
-        printf("%d ",
-               src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+        printf("%d ", src16[rr * src_stride + cc]);
       }
       printf("\n");
     }
@@ -247,20 +243,19 @@
 }
 void mismatch_check_block_tx(const uint8_t *src, int src_stride,
                              int frame_offset, int plane, int pixel_c,
-                             int pixel_r, int blk_w, int blk_h, int highbd) {
+                             int pixel_r, int blk_w, int blk_h) {
   if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
     printf("frame_buf undersized\n");
     assert(0);
   }
 
-  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
   int mismatch = 0;
   for (int r = 0; r < blk_h; ++r) {
     for (int c = 0; c < blk_w; ++c) {
       if (frame_tx[frame_buf_idx_r][plane]
                   [(r + pixel_r) * frame_stride + c + pixel_c] !=
-          (uint16_t)(src16 ? src16[r * src_stride + c]
-                           : src[r * src_stride + c])) {
+          (uint16_t)(src16[r * src_stride + c])) {
         mismatch = 1;
       }
     }
@@ -283,8 +278,7 @@
     printf("dec\n");
     for (int rr = 0; rr < blk_h; ++rr) {
       for (int cc = 0; cc < blk_w; ++cc) {
-        printf("%d ",
-               src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+        printf("%d ", src16[rr * src_stride + cc]);
       }
       printf("\n");
     }
diff --git a/aom_util/debug_util.h b/aom_util/debug_util.h
index 82efde8..7da9705 100644
--- a/aom_util/debug_util.h
+++ b/aom_util/debug_util.h
@@ -51,16 +51,16 @@
 void mismatch_reset_frame(int num_planes);
 void mismatch_record_block_pre(const uint8_t *src, int src_stride,
                                int frame_offset, int plane, int pixel_c,
-                               int pixel_r, int blk_w, int blk_h, int highbd);
+                               int pixel_r, int blk_w, int blk_h);
 void mismatch_record_block_tx(const uint8_t *src, int src_stride,
                               int frame_offset, int plane, int pixel_c,
-                              int pixel_r, int blk_w, int blk_h, int highbd);
+                              int pixel_r, int blk_w, int blk_h);
 void mismatch_check_block_pre(const uint8_t *src, int src_stride,
                               int frame_offset, int plane, int pixel_c,
-                              int pixel_r, int blk_w, int blk_h, int highbd);
+                              int pixel_r, int blk_w, int blk_h);
 void mismatch_check_block_tx(const uint8_t *src, int src_stride,
                              int frame_offset, int plane, int pixel_c,
-                             int pixel_r, int blk_w, int blk_h, int highbd);
+                             int pixel_r, int blk_w, int blk_h);
 #endif  // CONFIG_MISMATCH_DEBUG
 
 #ifdef __cplusplus
diff --git a/apps/aomdec.c b/apps/aomdec.c
index 3b6a31b..0beb468 100644
--- a/apps/aomdec.c
+++ b/apps/aomdec.c
@@ -445,7 +445,7 @@
   int opt_yv12 = 0;
   int opt_i420 = 0;
   int opt_raw = 0;
-  aom_codec_dec_cfg_t cfg = { 0, 0, 0, 0 };
+  aom_codec_dec_cfg_t cfg = { 0, 0, 0 };
   unsigned int fixed_output_bit_depth = 0;
   unsigned int is_annexb = 0;
   int frames_corrupted = 0;
diff --git a/apps/aomenc.c b/apps/aomenc.c
index 4e80e78..2f56922 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -273,7 +273,6 @@
   &g_av1_codec_arg_defs.large_scale_tile,
   &g_av1_codec_arg_defs.monochrome,
   &g_av1_codec_arg_defs.full_still_picture_hdr,
-  &g_av1_codec_arg_defs.use_16bit_internal,
   &g_av1_codec_arg_defs.save_as_annexb,
   NULL
 };
@@ -537,8 +536,6 @@
   int write_webm;
   const char *film_grain_filename;
   int write_ivf;
-  // whether to use 16bit internal buffers
-  int use_16bit_internal;
 #if CONFIG_TUNE_VMAF
   const char *vmaf_model_path;
 #endif
@@ -1107,9 +1104,6 @@
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.full_still_picture_hdr,
                          argi)) {
       config->cfg.full_still_picture_hdr = 1;
-    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_16bit_internal,
-                         argi)) {
-      warn("%s option deprecated. default to 1 always.\n", arg.name);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_mode, argi)) {
@@ -1244,7 +1238,6 @@
       if (!match) argj++;
     }
   }
-  config->use_16bit_internal = 1;
 
   return eos_mark_found;
 }
@@ -1620,7 +1613,6 @@
   int flags = 0;
 
   flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0;
-  flags |= stream->config.use_16bit_internal ? AOM_CODEC_USE_HIGHBITDEPTH : 0;
   flags |= global->quiet ? 0 : AOM_CODEC_USE_PER_FRAME_STATS;
 
   /* Construct Encoder Context */
@@ -1670,7 +1662,7 @@
   if (global->test_decode != TEST_DECODE_OFF) {
     aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(
         get_short_name_by_aom_encoder(global->codec));
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !stream->config.use_16bit_internal };
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0 };
     aom_codec_dec_init(&stream->decoder, decoder, &cfg, 0);
 
     if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
@@ -1902,34 +1894,12 @@
   AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE,
                                 &dec_img);
 
-  if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
-      (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
-    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
-      aom_image_t enc_hbd_img;
-      aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
-                    enc_img.d_w, enc_img.d_h, 16);
-      aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
-      enc_img = enc_hbd_img;
-    }
-    if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
-      aom_image_t dec_hbd_img;
-      aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
-                    dec_img.d_w, dec_img.d_h, 16);
-      aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
-      dec_img = dec_hbd_img;
-    }
-  }
-
   ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
   ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
 
   if (!aom_compare_img(&enc_img, &dec_img)) {
     int y[4], u[4], v[4];
-    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
-      aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
-    } else {
-      aom_find_mismatch(&enc_img, &dec_img, y, u, v);
-    }
+    aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
     stream->decoder.err = 1;
     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
                           "Stream %d: Encode/decode mismatch on frame %d at"
@@ -1950,7 +1920,6 @@
   aom_image_t raw;
   aom_image_t raw_shift;
   int allocated_raw_shift = 0;
-  int do_16bit_internal = 0;
   int input_shift = 0;
 
   struct AvxInputContext input;
@@ -2164,7 +2133,6 @@
         }
       }
       // Force encoder to use 16-bit pipeline for 8-bit video/image
-      stream->config.use_16bit_internal = 1;
       if (profile_updated && !global.quiet) {
         fprintf(stderr,
                 "Warning: automatically updating to profile %d to "
@@ -2254,9 +2222,6 @@
       // Currently assume that the bit_depths for all streams using
       // highbitdepth are the same.
       FOREACH_STREAM(stream, streams) {
-        if (stream->config.use_16bit_internal) {
-          do_16bit_internal = 1;
-        }
         input_shift = (int)stream->config.cfg.g_bit_depth -
                       stream->config.cfg.g_input_bit_depth;
       };
@@ -2294,8 +2259,7 @@
       fflush(stdout);
 
       aom_image_t *frame_to_encode;
-      if (input_shift || (do_16bit_internal && input.bit_depth == 8)) {
-        assert(do_16bit_internal);
+      if (input_shift || input.bit_depth == 8) {
         // Input bit depth and stream bit depth do not match, so up
         // shift frame to stream bit depth
         if (!allocated_raw_shift) {
@@ -2308,22 +2272,11 @@
       } else {
         frame_to_encode = &raw;
       }
-      if (do_16bit_internal) {
-        assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
-        FOREACH_STREAM(stream, streams) {
-          if (stream->config.use_16bit_internal)
-            encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL,
-                         seen_frames);
-          else
-            assert(0);
-        };
-      } else {
-        assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0);
-        FOREACH_STREAM(stream, streams) {
-          encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL,
-                       seen_frames);
-        }
-      }
+      assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+      FOREACH_STREAM(stream, streams) {
+        encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL,
+                     seen_frames);
+      };
 
       FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); }
 
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index 1098787..a824daf 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -213,8 +213,6 @@
       ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"),
   .full_still_picture_hdr = ARG_DEF(NULL, "full-still-picture-hdr", 0,
                                     "Use full header for still picture"),
-  .use_16bit_internal =
-      ARG_DEF(NULL, "use-16bit-internal", 0, "Force use of 16-bit pipeline"),
   .dropframe_thresh =
       ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"),
   .resize_mode = ARG_DEF(NULL, "resize-mode", 1, "Frame resize mode"),
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index 2e63c44..01e283b 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -79,7 +79,6 @@
   arg_def_t large_scale_tile;
   arg_def_t monochrome;
   arg_def_t full_still_picture_hdr;
-  arg_def_t use_16bit_internal;
   arg_def_t dropframe_thresh;
   arg_def_t resize_mode;
   arg_def_t resize_denominator;
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 0b94a40..4bc51a5 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -280,29 +280,19 @@
 endif()
 
 list(
-  APPEND
-  AOM_AV1_COMMON_INTRIN_SSE2
-  "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
+  APPEND AOM_AV1_COMMON_INTRIN_SSE2 "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
   "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
-  "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
-  "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
-  "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
-  "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
-  "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
-  "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
+  "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
 
 list(
   APPEND
   AOM_AV1_COMMON_INTRIN_SSSE3
   "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
-  "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c"
   "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h"
   "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c"
   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
   "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
-  "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
-  "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c"
-  "${AOM_ROOT}/av1/common/x86/resize_ssse3.c")
+  "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
 
 list(
   APPEND
@@ -320,31 +310,22 @@
   "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c"
   "${AOM_ROOT}/av1/common/x86/optflow_refine_sse4.c"
   "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c"
-  "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
-  "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
+  "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c")
 
 list(
   APPEND
   AOM_AV1_COMMON_INTRIN_AVX2
   "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h"
   "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
   "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
   "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c"
-  "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
+  "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c")
 
-list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
-     "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
+list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm")
 
 list(
   APPEND
@@ -354,7 +335,6 @@
   "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
   "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
   "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
-  "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
   "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c"
   "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
@@ -381,7 +361,6 @@
   "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c"
-  "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
   "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
@@ -389,7 +368,6 @@
   "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
-  "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
   "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
 
 list(
@@ -397,20 +375,15 @@
   AOM_AV1_ENCODER_INTRIN_NEON
   "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
   "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
-  "${AOM_ROOT}/av1/encoder/arm/neon/picksrt_neon.c"
   "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
-  "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
   "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
   "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
   "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
   "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c")
 
-list(
-  APPEND
-  AOM_AV1_ENCODER_INTRIN_MSA
-  "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
-  "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
-  "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
+list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
+     "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
+     "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
 
 list(
   APPEND
@@ -426,13 +399,8 @@
   "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
   "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
   "${AOM_ROOT}/av1/common/arm/reconintra_neon.c"
-  "${AOM_ROOT}/av1/common/arm/resize_neon.c"
-  "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
   "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
-  "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
-  "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
   "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c"
-  "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
   "${AOM_ROOT}/av1/common/cdef_block_neon.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 9f14ad2..2d862e8 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -22,6 +22,7 @@
 
 #include "aom/aom_encoder.h"
 #include "aom/internal/aom_codec_internal.h"
+#include "aom/internal/aom_image_internal.h"
 
 #include "av1/av1_iface_common.h"
 #include "av1/encoder/bitstream.h"
@@ -2548,8 +2549,6 @@
           lap_lag_in_frames = LAP_LAG_IN_FRAMES;
         }
       }
-      priv->oxcf.use_highbitdepth =
-          (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
 
       res = create_stats_buffer(&priv->frame_stats_buffer,
                                 &priv->stats_buf_context, *num_lap_buffers);
@@ -2811,14 +2810,22 @@
           timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
 
       YV12_BUFFER_CONFIG sd;
-      res = image2yuvconfig(img, &sd);
+      aom_image_t *hbd_img = NULL;
+      // May need a larger buffer for the high-bitdepth internal pipeline.
+      if (!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+        hbd_img = aom_img_alloc(NULL, img->fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                                img->w, img->h, 32);
+        if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+        image2yuvconfig_upshift(hbd_img, img, &sd);
+      } else {
+        res = image2yuvconfig(img, &sd);
+      }
       // When generating a monochrome stream, make |sd| a monochrome image.
       if (ctx->cfg.monochrome) {
         sd.u_buffer = sd.v_buffer = NULL;
         sd.uv_stride = 0;
         sd.monochrome = 1;
       }
-      int use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
       int subsampling_x = sd.subsampling_x;
       int subsampling_y = sd.subsampling_y;
 
@@ -2828,7 +2835,7 @@
 
         cpi->lookahead = av1_lookahead_init(
             cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
-            subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames,
+            subsampling_x, subsampling_y, lag_in_frames,
             cpi->oxcf.border_in_pixels, cpi->common.features.byte_alignment,
             ctx->num_lap_buffers);
       }
@@ -2836,12 +2843,10 @@
         aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
                            "Failed to allocate lag buffers");
 
-      av1_check_initial_width(cpi, use_highbitdepth, subsampling_x,
-                              subsampling_y);
+      av1_check_initial_width(cpi, subsampling_x, subsampling_y);
       if (cpi_lap != NULL) {
         cpi_lap->lookahead = cpi->lookahead;
-        av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x,
-                                subsampling_y);
+        av1_check_initial_width(cpi_lap, subsampling_x, subsampling_y);
       }
 
       // Store the original flags in to the frame buffer. Will extract the
@@ -2850,6 +2855,7 @@
                                 src_time_stamp, src_end_time_stamp)) {
         res = update_error_state(ctx, &cpi->common.error);
       }
+      aom_img_free(hbd_img);
       ctx->next_frame_flags = 0;
     }
 
@@ -3069,10 +3075,19 @@
   av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
 
   if (frame != NULL) {
+    aom_image_t *hbd_img = NULL;
     YV12_BUFFER_CONFIG sd;
 
-    image2yuvconfig(&frame->img, &sd);
+    if (!(frame->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+      hbd_img = aom_img_alloc(NULL, frame->img.fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                              frame->img.w, frame->img.h, 32);
+      if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+      image2yuvconfig_upshift(hbd_img, &frame->img, &sd);
+    } else {
+      image2yuvconfig(&frame->img, &sd);
+    }
     av1_set_reference_enc(ctx->cpi, frame->idx, &sd);
+    aom_img_free(hbd_img);
     return AOM_CODEC_OK;
   } else {
     return AOM_CODEC_INVALID_PARAM;
@@ -3084,10 +3099,19 @@
   av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
 
   if (frame != NULL) {
+    aom_image_t *hbd_img = NULL;
     YV12_BUFFER_CONFIG sd;
 
-    image2yuvconfig(&frame->img, &sd);
+    if (!(frame->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+      hbd_img = aom_img_alloc(NULL, frame->img.fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                              frame->img.w, frame->img.h, 32);
+      if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+      image2yuvconfig_upshift(hbd_img, &frame->img, &sd);
+    } else {
+      image2yuvconfig(&frame->img, &sd);
+    }
     av1_copy_reference_enc(ctx->cpi, frame->idx, &sd);
+    aom_img_free(hbd_img);
     return AOM_CODEC_OK;
   } else {
     return AOM_CODEC_INVALID_PARAM;
@@ -3135,9 +3159,21 @@
     YV12_BUFFER_CONFIG new_frame;
 
     if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      aom_image_t *hbd_img = NULL;
       YV12_BUFFER_CONFIG sd;
-      image2yuvconfig(new_img, &sd);
-      return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+
+      if (!(new_img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+        hbd_img = aom_img_alloc(NULL, new_img->fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                                new_img->w, new_img->h, 32);
+        if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+        image2yuvconfig_upshift(hbd_img, new_img, &sd);
+      } else {
+        image2yuvconfig(new_img, &sd);
+      }
+      aom_codec_err_t res =
+          av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+      aom_img_free(hbd_img);
+      return res;
     } else {
       return AOM_CODEC_ERROR;
     }
@@ -4047,11 +4083,10 @@
 aom_codec_iface_t aom_codec_av1_cx_algo = {
   "AOMedia Project AV1 Encoder" VERSION_STRING,
   AOM_CODEC_INTERNAL_ABI_VERSION,
-  AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
-      AOM_CODEC_CAP_PSNR,  // aom_codec_caps_t
-  encoder_init,            // aom_codec_init_fn_t
-  encoder_destroy,         // aom_codec_destroy_fn_t
-  encoder_ctrl_maps,       // aom_codec_ctrl_fn_map_t
+  AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR,  // aom_codec_caps_t
+  encoder_init,                                // aom_codec_init_fn_t
+  encoder_destroy,                             // aom_codec_destroy_fn_t
+  encoder_ctrl_maps,                           // aom_codec_ctrl_fn_map_t
   {
       // NOLINT
       NULL,  // aom_codec_peek_si_fn_t
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 184a906..85c77f4 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -93,8 +93,6 @@
     ctx->priv->init_flags = ctx->init_flags;
     priv->flushed = 0;
 
-    // TODO(tdaede): this should not be exposed to the API
-    priv->cfg.allow_lowbitdepth = 0;
     if (ctx->config.dec) {
       priv->cfg = *ctx->config.dec;
       ctx->config.dec = &priv->cfg;
@@ -470,7 +468,6 @@
   }
   frame_worker_data->frame_context_ready = 0;
   frame_worker_data->received_frame = 0;
-  frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
 
   // If decoding in serial mode, FrameWorker thread could create tile worker
   // thread or loopfilter thread.
@@ -825,13 +822,11 @@
           const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);
           const int mi_col = tile_col * tile_width;
           const int ssx = ctx->img.x_chroma_shift;
-          const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
           int plane;
-          ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
+          ctx->img.planes[0] += mi_col * MI_SIZE * 2;
           if (num_planes > 1) {
             for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-              ctx->img.planes[plane] +=
-                  mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+              ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * 2;
             }
           }
           ctx->img.d_w =
@@ -884,13 +879,25 @@
   av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *);
 
   if (data) {
+    aom_image_t *hbd_img = NULL;
     av1_ref_frame_t *const frame = data;
     YV12_BUFFER_CONFIG sd;
     AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    image2yuvconfig(&frame->img, &sd);
-    return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
-                                 frame->use_external_ref, &sd);
+    if (!(frame->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+      if (frame->use_external_ref) return AOM_CODEC_INVALID_PARAM;
+      hbd_img = aom_img_alloc(NULL, frame->img.fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                              frame->img.w, frame->img.h, 32);
+      if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+      image2yuvconfig_upshift(hbd_img, &frame->img, &sd);
+    } else {
+      image2yuvconfig(&frame->img, &sd);
+    }
+    aom_codec_err_t res =
+        av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
+                              frame->use_external_ref, &sd);
+    aom_img_free(hbd_img);
+    return res;
   } else {
     return AOM_CODEC_INVALID_PARAM;
   }
@@ -900,11 +907,22 @@
                                            va_list args) {
   const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
   if (frame) {
+    aom_image_t *hbd_img = NULL;
     YV12_BUFFER_CONFIG sd;
     AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    image2yuvconfig(&frame->img, &sd);
-    return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd);
+    if (!(frame->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+      hbd_img = aom_img_alloc(NULL, frame->img.fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                              frame->img.w, frame->img.h, 32);
+      if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+      image2yuvconfig_upshift(hbd_img, &frame->img, &sd);
+    } else {
+      image2yuvconfig(&frame->img, &sd);
+    }
+    aom_codec_err_t res =
+        av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd);
+    aom_img_free(hbd_img);
+    return res;
   } else {
     return AOM_CODEC_INVALID_PARAM;
   }
@@ -954,10 +972,20 @@
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
 
     if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+      aom_image_t *hbd_img = NULL;
       YV12_BUFFER_CONFIG sd;
-      image2yuvconfig(img, &sd);
-      return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame,
-                                    &sd);
+      if (!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+        hbd_img = aom_img_alloc(NULL, img->fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                                img->w, img->h, 32);
+        if (!hbd_img) return AOM_CODEC_MEM_ERROR;
+        image2yuvconfig_upshift(hbd_img, img, &sd);
+      } else {
+        image2yuvconfig(img, &sd);
+      }
+      aom_codec_err_t res = av1_copy_new_frame_dec(
+          &frame_worker_data->pbi->common, &new_frame, &sd);
+      aom_img_free(hbd_img);
+      return res;
     } else {
       return AOM_CODEC_ERROR;
     }
@@ -1357,8 +1385,7 @@
   return AOM_CODEC_INVALID_PARAM;
 }
 
-static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
-                                    int use_highbitdepth) {
+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y) {
   aom_img_fmt_t fmt = 0;
 
   if (subsampling_x == 0 && subsampling_y == 0)
@@ -1368,7 +1395,7 @@
   else if (subsampling_x == 1 && subsampling_y == 1)
     fmt = AOM_IMG_FMT_I420;
 
-  if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
   return fmt;
 }
 
@@ -1384,8 +1411,7 @@
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
 
       *img_fmt = get_img_format(cm->seq_params.subsampling_x,
-                                cm->seq_params.subsampling_y,
-                                cm->seq_params.use_highbitdepth);
+                                cm->seq_params.subsampling_y);
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
diff --git a/av1/av1_iface_common.h b/av1/av1_iface_common.h
index 0f335ca..b812b2e 100644
--- a/av1/av1_iface_common.h
+++ b/av1/av1_iface_common.h
@@ -83,19 +83,17 @@
   img->stride[AOM_PLANE_Y] = yv12->y_stride;
   img->stride[AOM_PLANE_U] = yv12->uv_stride;
   img->stride[AOM_PLANE_V] = yv12->uv_stride;
-  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
-    bps *= 2;
-    // aom_image_t uses byte strides and a pointer to the first byte
-    // of the image.
-    img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
-    img->bit_depth = yv12->bit_depth;
-    img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
-    img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
-    img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
-    img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
-    img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
-    img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
-  }
+  bps *= 2;
+  // aom_image_t uses byte strides and a pointer to the first byte
+  // of the image.
+  img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+  img->bit_depth = yv12->bit_depth;
+  img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+  img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+  img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+  img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
+  img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+  img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
   img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
@@ -135,26 +133,21 @@
   yv12->chroma_sample_position = img->csp;
   yv12->color_range = img->range;
 
-  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
-    // In aom_image_t
-    //     planes point to uint8 address of start of data
-    //     stride counts uint8s to reach next row
-    // In YV12_BUFFER_CONFIG
-    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
-    //     stride and border counts in uint16s
-    // This means that all the address calculations in the main body of code
-    // should work correctly.
-    // However, before we do any pixel operations we need to cast the address
-    // to a uint16 ponter and double its value.
-    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
-    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
-    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
-    yv12->y_stride >>= 1;
-    yv12->uv_stride >>= 1;
-    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
-  } else {
-    yv12->flags = 0;
-  }
+  // In aom_image_t
+  //     planes point to uint8 address of start of data
+  //     stride counts uint8s to reach next row
+  // In YV12_BUFFER_CONFIG
+  //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+  //     stride and border counts in uint16s
+  // This means that all the address calculations in the main body of code
+  // should work correctly.
+  // However, before we do any pixel operations we need to cast the address
+  // to a uint16 pointer and double its value.
+  yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+  yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+  yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+  yv12->y_stride >>= 1;
+  yv12->uv_stride >>= 1;
 
   // Note(yunqing): if img is allocated the same as the frame buffer, y_stride
   // is 32-byte aligned. Also, handle the cases while allocating img without a
@@ -167,4 +160,18 @@
   return AOM_CODEC_OK;
 }
 
+static void image2yuvconfig_upshift(aom_image_t *hbd_img,
+                                    const aom_image_t *img,
+                                    YV12_BUFFER_CONFIG *yv12) {
+  aom_img_upshift(hbd_img, img, 0);
+  // Copy some properties aom_img_upshift() ignores
+  hbd_img->cp = img->cp;
+  hbd_img->tc = img->tc;
+  hbd_img->mc = img->mc;
+  hbd_img->monochrome = img->monochrome;
+  hbd_img->csp = img->csp;
+  hbd_img->range = img->range;
+  image2yuvconfig(hbd_img, yv12);
+  yv12->metadata = img->metadata;
+}
 #endif  // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 06cac16..c851289 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -86,15 +86,13 @@
   // Now we need to allocate enough space to store the line buffers for the
   // stripes
   const int frame_w = cm->superres_upscaled_width;
-  const int use_highbd = cm->seq_params.use_highbitdepth;
 
   for (int p = 0; p < num_planes; ++p) {
     const int is_uv = p > 0;
     const int ss_x = is_uv && cm->seq_params.subsampling_x;
     const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
     const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
-    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
-                         << use_highbd;
+    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT << 1;
     RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
 
     if (buf_size != boundaries->stripe_boundary_size ||
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
deleted file mode 100644
index a0bbe54..0000000
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ /dev/null
@@ -1,4272 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "av1/common/av1_inv_txfm1d.h"
-#include "av1/common/av1_inv_txfm1d_cfg.h"
-#include "av1/common/av1_txfm.h"
-#include "av1/common/enums.h"
-#include "av1/common/idct.h"
-#include "av1/common/arm/av1_inv_txfm_neon.h"
-#include "av1/common/arm/transpose_neon.h"
-
-// 1D itx types
-typedef enum ATTRIBUTE_PACKED {
-  IDCT_1D,
-  IADST_1D,
-  IFLIPADST_1D = IADST_1D,
-  IIDENTITY_1D,
-  ITX_TYPES_1D,
-} ITX_TYPE_1D;
-
-static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
-  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
-  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
-  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
-  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
-};
-
-static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
-  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
-  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
-  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
-  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
-};
-
-// 1D functions
-static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
-  { av1_idct4, av1_iadst4, av1_iidentity4_c },
-  { av1_idct8, av1_iadst8, av1_iidentity8_c },
-  { av1_idct16, av1_iadst16, av1_iidentity16_c },
-  { av1_idct32, NULL, NULL },
-  { av1_idct64, NULL, NULL },
-};
-
-static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
-                                                  uint8_t *output, int stride,
-                                                  int flipud,
-                                                  const int height) {
-  int j = flipud ? (height - 1) : 0;
-  const int step = flipud ? -1 : 1;
-  int16x8_t temp_output;
-  for (int i = 0; i < height; ++i, j += step) {
-    temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
-    temp_output = vaddq_s16(temp_output, in[j]);
-    vst1_u8(output, vqmovun_s16(temp_output));
-    output += stride;
-  }
-}
-
-static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
-                                                    int16x8_t res0,
-                                                    int16x8_t res1) {
-  int16x8_t temp_output[2];
-  uint8x16_t temp_output_8q;
-  temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
-  temp_output[0] = vaddq_s16(temp_output[0], res0);
-  temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
-  temp_output[1] = vaddq_s16(temp_output[1], res1);
-  temp_output_8q =
-      vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
-  return temp_output_8q;
-}
-
-static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
-                                                   uint8_t *output, int stride,
-                                                   int flipud, int height) {
-  uint8x16_t temp_output_8q;
-  int j = flipud ? (height - 1) : 0;
-  const int step = flipud ? -1 : 1;
-  for (int i = 0; i < height; ++i, j += step) {
-    temp_output_8q = vld1q_u8(output + i * stride);
-    temp_output_8q =
-        lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]);
-    vst1q_u8((output + i * stride), temp_output_8q);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
-                                                int value) {
-  for (int i = 0; i < size; i++) {
-    a[i] = vdupq_n_s16((int16_t)value);
-  }
-}
-
-static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
-                                        const int16x8_t in1, const int16x4_t c,
-                                        int16x8_t *t0, int16x8_t *t1) {
-  int32x4_t s0[2], s1[2];
-  int16x4_t v0[2], v1[2];
-
-  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
-  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
-  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
-  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
-
-  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
-  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
-  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
-  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
-
-  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
-  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
-  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
-  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
-
-  *t0 = vcombine_s16(v0[0], v0[1]);
-  *t1 = vcombine_s16(v1[0], v1[1]);
-}
-
-static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
-                                        const int16x8_t in1, const int16x4_t c,
-                                        int16x8_t *t0, int16x8_t *t1) {
-  int32x4_t s0[2], s1[2];
-  int16x4_t v0[2], v1[2];
-
-  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
-  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
-  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
-  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
-
-  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
-  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
-  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
-  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);
-
-  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
-  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
-  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
-  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
-
-  *t0 = vcombine_s16(v0[0], v0[1]);
-  *t1 = vcombine_s16(v1[0], v1[1]);
-}
-
-static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
-                                        const int16x8_t in1, const int16x4_t c,
-                                        int16x8_t *t0, int16x8_t *t1) {
-  int32x4_t s0[2], s1[2];
-  int16x4_t v0[2], v1[2];
-
-  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
-  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
-  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
-  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
-
-  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
-  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
-  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
-  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
-
-  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
-  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
-  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
-  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
-
-  *t0 = vcombine_s16(v0[0], v0[1]);
-  *t1 = vcombine_s16(v1[0], v1[1]);
-}
-
-static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
-                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
-  int32x4_t s0_l, s0_h, s1_l, s1_h;
-  int16x4_t v0[2], v1[2];
-
-  s0_l = vmull_n_s16(vget_low_s16(in0), coef1);
-  s0_h = vmull_n_s16(vget_high_s16(in0), coef1);
-  s1_l = vmull_n_s16(vget_low_s16(in0), coef2);
-  s1_h = vmull_n_s16(vget_high_s16(in0), coef2);
-
-  v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT);
-  v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT);
-  v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT);
-  v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT);
-
-  *t0 = vcombine_s16(v0[0], v0[1]);
-  *t1 = vcombine_s16(v1[0], v1[1]);
-}
-
-static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
-                                        const int16x8_t in1, const int16x4_t c,
-                                        int16x8_t *t0, int16x8_t *t1) {
-  int32x4_t s0[2], s1[2];
-  int16x4_t v0[2], v1[2];
-
-  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
-  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
-  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
-  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
-
-  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
-  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
-  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
-  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
-
-  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
-  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
-  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
-  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
-
-  *t0 = vcombine_s16(v0[0], v0[1]);
-  *t1 = vcombine_s16(v1[0], v1[1]);
-}
-
-static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
-  int32x4_t t0[2], t1[2];
-  int16x4_t v0[2], v1[2];
-
-  // Don't add/sub before multiply, which will overflow in iadst8.
-  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
-  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
-  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
-  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
-
-  t0[0] = vaddq_s32(x0_lo, x1_lo);
-  t0[1] = vaddq_s32(x0_hi, x1_hi);
-  t1[0] = vsubq_s32(x0_lo, x1_lo);
-  t1[1] = vsubq_s32(x0_hi, x1_hi);
-
-  v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
-  v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
-  v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
-  v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);
-
-  x[0] = vcombine_s16(v0[0], v0[1]);
-  x[1] = vcombine_s16(v1[0], v1[1]);
-}
-
-static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
-                                       const int16_t c2, const int16_t c3) {
-  int16x4_t val = vdup_n_s16((int16_t)0);
-  val = vset_lane_s16(c0, val, 0);
-  val = vset_lane_s16(c1, val, 1);
-  val = vset_lane_s16(c2, val, 2);
-  val = vset_lane_s16(c3, val, 3);
-  return val;
-}
-
-static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
-                               int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
-                                      (int16_t)cospi[20], (int16_t)cospi[44]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
-                                      (int16_t)cospi[52], (int16_t)cospi[12]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  int16x8_t x[8];
-  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  // Stage 1
-  x[0] = in[7];
-  x[1] = in[0];
-  x[2] = in[5];
-  x[3] = in[2];
-  x[4] = in[3];
-  x[5] = in[4];
-  x[6] = in[1];
-  x[7] = in[6];
-
-  // Stage 2
-  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
-  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
-  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
-  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
-
-  // Stage 3
-  x[0] = vqaddq_s16(s0, s4);
-  x[1] = vqaddq_s16(s1, s5);
-  x[2] = vqaddq_s16(s2, s6);
-  x[3] = vqaddq_s16(s3, s7);
-  x[4] = vqsubq_s16(s0, s4);
-  x[5] = vqsubq_s16(s1, s5);
-  x[6] = vqsubq_s16(s2, s6);
-  x[7] = vqsubq_s16(s3, s7);
-
-  // Stage 4
-  s0 = x[0];
-  s1 = x[1];
-  s2 = x[2];
-  s3 = x[3];
-  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
-  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
-
-  // Stage 5
-  x[0] = vqaddq_s16(s0, s2);
-  x[1] = vqaddq_s16(s1, s3);
-  x[2] = vqsubq_s16(s0, s2);
-  x[3] = vqsubq_s16(s1, s3);
-  x[4] = vqaddq_s16(s4, s6);
-  x[5] = vqaddq_s16(s5, s7);
-  x[6] = vqsubq_s16(s4, s6);
-  x[7] = vqsubq_s16(s5, s7);
-
-  // stage 6
-  btf_16_half_neon(x + 2, c2);
-  btf_16_half_neon(x + 6, c2);
-
-  // Stage 7
-  out[0] = x[0];
-  out[1] = vqnegq_s16(x[4]);
-  out[2] = x[6];
-  out[3] = vqnegq_s16(x[2]);
-  out[4] = x[3];
-  out[5] = vqnegq_s16(x[7]);
-  out[6] = x[5];
-  out[7] = vqnegq_s16(x[1]);
-}
-
-static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  int16x8_t x[8];
-  int16x8_t s0, s1, s4, s5;
-
-  // Stage 1
-  x[1] = in[0];
-
-  // Stage 2
-
-  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
-
-  // Stage 3
-  x[0] = s0;
-  x[1] = s1;
-  x[4] = s0;
-  x[5] = s1;
-
-  // Stage 4
-  s0 = x[0];
-  s1 = x[1];
-  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
-
-  // Stage 5
-  x[0] = s0;
-  x[1] = s1;
-  x[2] = s0;
-  x[3] = s1;
-  x[4] = s4;
-  x[5] = s5;
-  x[6] = s4;
-  x[7] = s5;
-
-  // stage 6
-  btf_16_half_neon(x + 2, c2);
-  btf_16_half_neon(x + 6, c2);
-
-  // Stage 7
-  out[0] = x[0];
-  out[1] = vqnegq_s16(x[4]);
-  out[2] = x[6];
-  out[3] = vqnegq_s16(x[2]);
-  out[4] = x[3];
-  out[5] = vqnegq_s16(x[7]);
-  out[6] = x[5];
-  out[7] = vqnegq_s16(x[1]);
-}
-
-static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                              int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1[8], step2[8];
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  // stage 2
-  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
-  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
-
-  // stage 3
-  btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
-  btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
-  step2[4] = vqaddq_s16(step1[4], step1[5]);
-  step2[5] = vqsubq_s16(step1[4], step1[5]);
-  step2[6] = vqsubq_s16(step1[7], step1[6]);
-  step2[7] = vqaddq_s16(step1[7], step1[6]);
-
-  // stage 4
-  step1[0] = vqaddq_s16(step2[0], step2[3]);
-  step1[1] = vqaddq_s16(step2[1], step2[2]);
-  step1[2] = vqsubq_s16(step2[1], step2[2]);
-  step1[3] = vqsubq_s16(step2[0], step2[3]);
-  btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);
-
-  // stage 5
-  out[0] = vqaddq_s16(step1[0], step2[7]);
-  out[1] = vqaddq_s16(step1[1], step1[6]);
-  out[2] = vqaddq_s16(step1[2], step1[5]);
-  out[3] = vqaddq_s16(step1[3], step2[4]);
-  out[4] = vqsubq_s16(step1[3], step2[4]);
-  out[5] = vqsubq_s16(step1[2], step1[5]);
-  out[6] = vqsubq_s16(step1[1], step1[6]);
-  out[7] = vqsubq_s16(step1[0], step2[7]);
-}
-
-static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
-                                   int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1;
-  int32x4_t t32[2];
-
-  // stage 1
-  // stage 2
-  // stage 3
-  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
-  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
-
-  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
-                       vrshrn_n_s32(t32[1], INV_COS_BIT));
-
-  // stage 4
-  // stage 5
-  out[0] = step1;
-  out[1] = step1;
-  out[2] = step1;
-  out[3] = step1;
-  out[4] = step1;
-  out[5] = step1;
-  out[6] = step1;
-  out[7] = step1;
-}
-
-void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
-  assert(!(size % 4));
-  if (!bit) return;
-  const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
-  for (int i = 0; i < size; i++) {
-    arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8);
-  }
-}
-
-static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
-  int16x8_t temp[8];
-  for (int i = 0; i < size; ++i) {
-    temp[i] = input[size - 1 - i];
-  }
-  for (int i = 0; i < size; ++i) {
-    input[i] = temp[i];
-  }
-}
-
-static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
-                                                   int16x8_t *const a,
-                                                   int out_size) {
-  for (int i = 0; i < 8; ++i) {
-    a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
-                        vmovn_s32(vld1q_s32(input + 4)));
-    input += out_size;
-  }
-}
-
-static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
-                                         4 * 5793 };
-
-static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
-                                            int txw_idx, int8_t size, int bit) {
-  const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
-  int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
-  int16x4_t low_i16, high_i16;
-  int32x4_t low_i32, high_i32;
-  for (int i = 0; i < size; i++) {
-    int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale);
-    int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale);
-    low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4);
-    high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4);
-    low_i16 = vqmovn_s32(low_i32);
-    high_i16 = vqmovn_s32(high_i32);
-    output[i] = vcombine_s16(low_i16, high_i16);
-  }
-}
-
-static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
-                                        int size) {
-  int32x4_t out_low, out_high;
-  int16x4_t low, high;
-
-  for (int z = 0; z < size; ++z) {
-    out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
-    out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);
-
-    low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
-    high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
-
-    output[z] = vcombine_s16(low, high);
-  }
-}
-
-static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1;
-  int32x4_t t32[2];
-
-  // stage 4
-
-  t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
-  t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
-  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
-                       vrshrn_n_s32(t32[1], INV_COS_BIT));
-
-  // stage 6
-  // stage 7
-  out[0] = step1;
-  out[1] = step1;
-  out[2] = step1;
-  out[3] = step1;
-  out[4] = step1;
-  out[5] = step1;
-  out[6] = step1;
-  out[7] = step1;
-  out[8] = step1;
-  out[9] = step1;
-  out[10] = step1;
-  out[11] = step1;
-  out[12] = step1;
-  out[13] = step1;
-  out[14] = step1;
-  out[15] = step1;
-}
-
-static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                               int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1[16], step2[16];
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
-                                      (int16_t)cospi[36], (int16_t)cospi[28]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
-                                      (int16_t)cospi[52], (int16_t)cospi[12]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c4 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-  // stage 2
-
-  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
-  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
-  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
-  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
-
-  step2[0] = in[0];
-  step2[1] = in[8];
-  step2[2] = in[4];
-  step2[3] = in[12];
-  step2[4] = in[2];
-  step2[5] = in[10];
-  step2[6] = in[6];
-  step2[7] = in[14];
-
-  // stage 3
-
-  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
-  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
-
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-  step1[8] = vqaddq_s16(step2[8], step2[9]);
-  step1[9] = vqsubq_s16(step2[8], step2[9]);
-  step1[10] = vqsubq_s16(step2[11], step2[10]);
-  step1[11] = vqaddq_s16(step2[11], step2[10]);
-  step1[12] = vqaddq_s16(step2[12], step2[13]);
-  step1[13] = vqsubq_s16(step2[12], step2[13]);
-  step1[14] = vqsubq_s16(step2[15], step2[14]);
-  step1[15] = vqaddq_s16(step2[15], step2[14]);
-
-  // stage 4
-
-  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
-  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
-  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);
-
-  step2[4] = vqaddq_s16(step1[4], step1[5]);
-  step2[5] = vqsubq_s16(step1[4], step1[5]);
-  step2[6] = vqsubq_s16(step1[7], step1[6]);
-  step2[7] = vqaddq_s16(step1[7], step1[6]);
-  step2[8] = step1[8];
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-  step2[15] = step1[15];
-
-  // stage 5
-
-  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
-
-  step1[0] = vqaddq_s16(step2[0], step2[3]);
-  step1[1] = vqaddq_s16(step2[1], step2[2]);
-  step1[2] = vqsubq_s16(step2[1], step2[2]);
-  step1[3] = vqsubq_s16(step2[0], step2[3]);
-  step1[4] = step2[4];
-  step1[7] = step2[7];
-  step1[8] = vqaddq_s16(step2[8], step2[11]);
-  step1[9] = vqaddq_s16(step2[9], step2[10]);
-  step1[10] = vqsubq_s16(step2[9], step2[10]);
-  step1[11] = vqsubq_s16(step2[8], step2[11]);
-  step1[12] = vqsubq_s16(step2[15], step2[12]);
-  step1[13] = vqsubq_s16(step2[14], step2[13]);
-  step1[14] = vqaddq_s16(step2[14], step2[13]);
-  step1[15] = vqaddq_s16(step2[15], step2[12]);
-
-  // stage 6
-
-  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
-  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[7]);
-  step2[1] = vqaddq_s16(step1[1], step1[6]);
-  step2[2] = vqaddq_s16(step1[2], step1[5]);
-  step2[3] = vqaddq_s16(step1[3], step1[4]);
-  step2[4] = vqsubq_s16(step1[3], step1[4]);
-  step2[5] = vqsubq_s16(step1[2], step1[5]);
-  step2[6] = vqsubq_s16(step1[1], step1[6]);
-  step2[7] = vqsubq_s16(step1[0], step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  out[0] = vqaddq_s16(step2[0], step2[15]);
-  out[1] = vqaddq_s16(step2[1], step2[14]);
-  out[2] = vqaddq_s16(step2[2], step2[13]);
-  out[3] = vqaddq_s16(step2[3], step2[12]);
-  out[4] = vqaddq_s16(step2[4], step2[11]);
-  out[5] = vqaddq_s16(step2[5], step2[10]);
-  out[6] = vqaddq_s16(step2[6], step2[9]);
-  out[7] = vqaddq_s16(step2[7], step2[8]);
-  out[8] = vqsubq_s16(step2[7], step2[8]);
-  out[9] = vqsubq_s16(step2[6], step2[9]);
-  out[10] = vqsubq_s16(step2[5], step2[10]);
-  out[11] = vqsubq_s16(step2[4], step2[11]);
-  out[12] = vqsubq_s16(step2[3], step2[12]);
-  out[13] = vqsubq_s16(step2[2], step2[13]);
-  out[14] = vqsubq_s16(step2[1], step2[14]);
-  out[15] = vqsubq_s16(step2[0], step2[15]);
-}
-
-static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1[16], step2[16];
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c1 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-
-  // stage 1
-  // stage 2
-
-  step2[0] = in[0];
-  step2[2] = in[4];
-  step2[4] = in[2];
-  step2[6] = in[6];
-
-  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
-  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
-  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
-  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
-
-  // stage 3
-
-  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
-  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
-
-  step1[0] = step2[0];
-  step1[2] = step2[2];
-  step1[8] = vqaddq_s16(step2[8], step2[9]);
-  step1[9] = vqsubq_s16(step2[8], step2[9]);
-  step1[10] = vqsubq_s16(step2[11], step2[10]);
-  step1[11] = vqaddq_s16(step2[11], step2[10]);
-  step1[12] = vqaddq_s16(step2[12], step2[13]);
-  step1[13] = vqsubq_s16(step2[12], step2[13]);
-  step1[14] = vqsubq_s16(step2[15], step2[14]);
-  step1[15] = vqaddq_s16(step2[15], step2[14]);
-
-  // stage 4
-
-  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
-  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
-  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);
-
-  step2[4] = vqaddq_s16(step1[4], step1[5]);
-  step2[5] = vqsubq_s16(step1[4], step1[5]);
-  step2[6] = vqsubq_s16(step1[7], step1[6]);
-  step2[7] = vqaddq_s16(step1[7], step1[6]);
-  step2[8] = step1[8];
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-  step2[15] = step1[15];
-
-  // stage 5
-
-  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
-  step1[0] = vqaddq_s16(step2[0], step2[3]);
-  step1[1] = vqaddq_s16(step2[1], step2[2]);
-  step1[2] = vqsubq_s16(step2[1], step2[2]);
-  step1[3] = vqsubq_s16(step2[0], step2[3]);
-  step1[4] = step2[4];
-  step1[7] = step2[7];
-  step1[8] = vqaddq_s16(step2[8], step2[11]);
-  step1[9] = vqaddq_s16(step2[9], step2[10]);
-  step1[10] = vqsubq_s16(step2[9], step2[10]);
-  step1[11] = vqsubq_s16(step2[8], step2[11]);
-  step1[12] = vqsubq_s16(step2[15], step2[12]);
-  step1[13] = vqsubq_s16(step2[14], step2[13]);
-  step1[14] = vqaddq_s16(step2[14], step2[13]);
-  step1[15] = vqaddq_s16(step2[15], step2[12]);
-
-  // stage 6
-  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
-  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[7]);
-  step2[1] = vqaddq_s16(step1[1], step1[6]);
-  step2[2] = vqaddq_s16(step1[2], step1[5]);
-  step2[3] = vqaddq_s16(step1[3], step1[4]);
-  step2[4] = vqsubq_s16(step1[3], step1[4]);
-  step2[5] = vqsubq_s16(step1[2], step1[5]);
-  step2[6] = vqsubq_s16(step1[1], step1[6]);
-  step2[7] = vqsubq_s16(step1[0], step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-
-  out[0] = vqaddq_s16(step2[0], step2[15]);
-  out[1] = vqaddq_s16(step2[1], step2[14]);
-  out[2] = vqaddq_s16(step2[2], step2[13]);
-  out[3] = vqaddq_s16(step2[3], step2[12]);
-  out[4] = vqaddq_s16(step2[4], step2[11]);
-  out[5] = vqaddq_s16(step2[5], step2[10]);
-  out[6] = vqaddq_s16(step2[6], step2[9]);
-  out[7] = vqaddq_s16(step2[7], step2[8]);
-  out[8] = vqsubq_s16(step2[7], step2[8]);
-  out[9] = vqsubq_s16(step2[6], step2[9]);
-  out[10] = vqsubq_s16(step2[5], step2[10]);
-  out[11] = vqsubq_s16(step2[4], step2[11]);
-  out[12] = vqsubq_s16(step2[3], step2[12]);
-  out[13] = vqsubq_s16(step2[2], step2[13]);
-  out[14] = vqsubq_s16(step2[1], step2[14]);
-  out[15] = vqsubq_s16(step2[0], step2[15]);
-}
-
-static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
-                                int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
-                                      (int16_t)cospi[10], (int16_t)cospi[54]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
-                                      (int16_t)cospi[26], (int16_t)cospi[38]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
-                                      (int16_t)cospi[42], (int16_t)cospi[22]);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
-                                      (int16_t)cospi[58], (int16_t)cospi[6]);
-  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  int16x8_t x[16];
-  int16x8_t t[14];
-  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
-
-  // Stage 1
-  x[0] = in[15];
-  x[1] = in[0];
-  x[2] = in[13];
-  x[3] = in[2];
-  x[4] = in[11];
-  x[5] = in[4];
-  x[6] = in[9];
-  x[7] = in[6];
-  x[8] = in[7];
-  x[9] = in[8];
-  x[10] = in[5];
-  x[11] = in[10];
-  x[12] = in[3];
-  x[13] = in[12];
-  x[14] = in[1];
-  x[15] = in[14];
-
-  // Stage 2
-  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
-  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
-  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
-  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
-  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
-  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
-  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
-  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
-
-  // Stage 3
-  x[0] = vqaddq_s16(s0, s8);
-  x[1] = vqaddq_s16(s1, s9);
-  x[2] = vqaddq_s16(s2, s10);
-  x[3] = vqaddq_s16(s3, s11);
-  x[4] = vqaddq_s16(s4, s12);
-  x[5] = vqaddq_s16(s5, s13);
-  x[6] = vqaddq_s16(s6, s14);
-  x[7] = vqaddq_s16(s7, s15);
-  x[8] = vqsubq_s16(s0, s8);
-  x[9] = vqsubq_s16(s1, s9);
-  x[10] = vqsubq_s16(s2, s10);
-  x[11] = vqsubq_s16(s3, s11);
-  x[12] = vqsubq_s16(s4, s12);
-  x[13] = vqsubq_s16(s5, s13);
-  x[14] = vqsubq_s16(s6, s14);
-  x[15] = vqsubq_s16(s7, s15);
-
-  // Stage 4
-  t[0] = x[0];
-  t[1] = x[1];
-  t[2] = x[2];
-  t[3] = x[3];
-  t[4] = x[4];
-  t[5] = x[5];
-  t[6] = x[6];
-  t[7] = x[7];
-  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
-  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
-  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
-  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
-
-  // Stage 5
-  x[0] = vqaddq_s16(t[0], t[4]);
-  x[1] = vqaddq_s16(t[1], t[5]);
-  x[2] = vqaddq_s16(t[2], t[6]);
-  x[3] = vqaddq_s16(t[3], t[7]);
-  x[4] = vqsubq_s16(t[0], t[4]);
-  x[5] = vqsubq_s16(t[1], t[5]);
-  x[6] = vqsubq_s16(t[2], t[6]);
-  x[7] = vqsubq_s16(t[3], t[7]);
-  x[8] = vqaddq_s16(s8, s12);
-  x[9] = vqaddq_s16(s9, s13);
-  x[10] = vqaddq_s16(s10, s14);
-  x[11] = vqaddq_s16(s11, s15);
-  x[12] = vqsubq_s16(s8, s12);
-  x[13] = vqsubq_s16(s9, s13);
-  x[14] = vqsubq_s16(s10, s14);
-  x[15] = vqsubq_s16(s11, s15);
-
-  // stage 6
-  t[0] = x[0];
-  t[1] = x[1];
-  t[2] = x[2];
-  t[3] = x[3];
-  btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
-  btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
-  t[8] = x[8];
-  t[9] = x[9];
-  t[10] = x[10];
-  t[11] = x[11];
-  btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
-  btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);
-
-  // Stage 7
-  x[0] = vqaddq_s16(t[0], t[2]);
-  x[1] = vqaddq_s16(t[1], t[3]);
-  x[2] = vqsubq_s16(t[0], t[2]);
-  x[3] = vqsubq_s16(t[1], t[3]);
-  x[4] = vqaddq_s16(s4, s6);
-  x[5] = vqaddq_s16(s5, s7);
-  x[6] = vqsubq_s16(s4, s6);
-  x[7] = vqsubq_s16(s5, s7);
-  x[8] = vqaddq_s16(t[8], t[10]);
-  x[9] = vqaddq_s16(t[9], t[11]);
-  x[10] = vqsubq_s16(t[8], t[10]);
-  x[11] = vqsubq_s16(t[9], t[11]);
-  x[12] = vqaddq_s16(s12, s14);
-  x[13] = vqaddq_s16(s13, s15);
-  x[14] = vqsubq_s16(s12, s14);
-  x[15] = vqsubq_s16(s13, s15);
-
-  // Stage 8
-  btf_16_half_neon(x + 2, c5);
-  btf_16_half_neon(x + 6, c5);
-  btf_16_half_neon(x + 10, c5);
-  btf_16_half_neon(x + 14, c5);
-
-  // Stage 9
-  out[0] = x[0];
-  out[1] = vqnegq_s16(x[8]);
-  out[2] = x[12];
-  out[3] = vqnegq_s16(x[4]);
-  out[4] = x[6];
-  out[5] = vqnegq_s16(x[14]);
-  out[6] = x[10];
-  out[7] = vqnegq_s16(x[2]);
-  out[8] = x[3];
-  out[9] = vqnegq_s16(x[11]);
-  out[10] = x[15];
-  out[11] = vqnegq_s16(x[7]);
-  out[12] = x[5];
-  out[13] = vqnegq_s16(x[13]);
-  out[14] = x[9];
-  out[15] = vqnegq_s16(x[1]);
-}
-
-static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  int16x8_t x[16];
-  int16x8_t t[10];
-  int16x8_t s0, s1, s4, s5;
-  int16x8_t s8, s9, s12, s13;
-
-  // Stage 1
-  x[1] = in[0];
-
-  // Stage 2
-  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
-
-  // Stage 3
-  x[0] = s0;
-  x[1] = s1;
-  x[8] = s0;
-  x[9] = s1;
-
-  // Stage 4
-  t[0] = x[0];
-  t[1] = x[1];
-  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
-
-  // Stage 5
-  x[0] = t[0];
-  x[1] = t[1];
-  x[4] = t[0];
-  x[5] = t[1];
-  x[8] = s8;
-  x[9] = s9;
-  x[12] = s8;
-  x[13] = s9;
-
-  // stage 6
-  t[0] = x[0];
-  t[1] = x[1];
-  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
-  t[8] = x[8];
-  t[9] = x[9];
-  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
-
-  // Stage 7
-  x[0] = t[0];
-  x[1] = t[1];
-  x[2] = t[0];
-  x[3] = t[1];
-  x[4] = s4;
-  x[5] = s5;
-  x[6] = s4;
-  x[7] = s5;
-  x[8] = t[8];
-  x[9] = t[9];
-  x[10] = t[8];
-  x[11] = t[9];
-  x[12] = s12;
-  x[13] = s13;
-  x[14] = s12;
-  x[15] = s13;
-
-  // Stage 8
-  btf_16_half_neon(x + 2, c1);
-  btf_16_half_neon(x + 6, c1);
-  btf_16_half_neon(x + 10, c1);
-  btf_16_half_neon(x + 14, c1);
-
-  // Stage 9
-  out[0] = x[0];
-  out[1] = vqnegq_s16(x[8]);
-  out[2] = x[12];
-  out[3] = vqnegq_s16(x[4]);
-  out[4] = x[6];
-  out[5] = vqnegq_s16(x[14]);
-  out[6] = x[10];
-  out[7] = vqnegq_s16(x[2]);
-  out[8] = x[3];
-  out[9] = vqnegq_s16(x[11]);
-  out[10] = x[15];
-  out[11] = vqnegq_s16(x[7]);
-  out[12] = x[5];
-  out[13] = vqnegq_s16(x[13]);
-  out[14] = x[9];
-  out[15] = vqnegq_s16(x[1]);
-}
-
-static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  int16x8_t x[16];
-  int16x8_t t[14];
-  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
-
-  // Stage 1
-  x[1] = in[0];
-  x[3] = in[2];
-  x[5] = in[4];
-  x[7] = in[6];
-  x[8] = in[7];
-  x[10] = in[5];
-  x[12] = in[3];
-  x[14] = in[1];
-
-  // Stage 2
-  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
-  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
-  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
-  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
-
-  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
-  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
-  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
-  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
-
-  // Stage 3
-  x[0] = vqaddq_s16(s0, s8);
-  x[1] = vqaddq_s16(s1, s9);
-  x[2] = vqaddq_s16(s2, s10);
-  x[3] = vqaddq_s16(s3, s11);
-  x[4] = vqaddq_s16(s4, s12);
-  x[5] = vqaddq_s16(s5, s13);
-  x[6] = vqaddq_s16(s6, s14);
-  x[7] = vqaddq_s16(s7, s15);
-  x[8] = vqsubq_s16(s0, s8);
-  x[9] = vqsubq_s16(s1, s9);
-  x[10] = vqsubq_s16(s2, s10);
-  x[11] = vqsubq_s16(s3, s11);
-  x[12] = vqsubq_s16(s4, s12);
-  x[13] = vqsubq_s16(s5, s13);
-  x[14] = vqsubq_s16(s6, s14);
-  x[15] = vqsubq_s16(s7, s15);
-
-  // Stage 4
-  t[0] = x[0];
-  t[1] = x[1];
-  t[2] = x[2];
-  t[3] = x[3];
-  t[4] = x[4];
-  t[5] = x[5];
-  t[6] = x[6];
-  t[7] = x[7];
-  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
-  btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
-  btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
-  btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);
-
-  // Stage 5
-  x[0] = vqaddq_s16(t[0], t[4]);
-  x[1] = vqaddq_s16(t[1], t[5]);
-  x[2] = vqaddq_s16(t[2], t[6]);
-  x[3] = vqaddq_s16(t[3], t[7]);
-  x[4] = vqsubq_s16(t[0], t[4]);
-  x[5] = vqsubq_s16(t[1], t[5]);
-  x[6] = vqsubq_s16(t[2], t[6]);
-  x[7] = vqsubq_s16(t[3], t[7]);
-  x[8] = vqaddq_s16(s8, s12);
-  x[9] = vqaddq_s16(s9, s13);
-  x[10] = vqaddq_s16(s10, s14);
-  x[11] = vqaddq_s16(s11, s15);
-  x[12] = vqsubq_s16(s8, s12);
-  x[13] = vqsubq_s16(s9, s13);
-  x[14] = vqsubq_s16(s10, s14);
-  x[15] = vqsubq_s16(s11, s15);
-
-  // stage 6
-  t[0] = x[0];
-  t[1] = x[1];
-  t[2] = x[2];
-  t[3] = x[3];
-  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
-  btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
-  t[8] = x[8];
-  t[9] = x[9];
-  t[10] = x[10];
-  t[11] = x[11];
-  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
-  btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);
-
-  // Stage 7
-  x[0] = vqaddq_s16(t[0], t[2]);
-  x[1] = vqaddq_s16(t[1], t[3]);
-  x[2] = vqsubq_s16(t[0], t[2]);
-  x[3] = vqsubq_s16(t[1], t[3]);
-  x[4] = vqaddq_s16(s4, s6);
-  x[5] = vqaddq_s16(s5, s7);
-  x[6] = vqsubq_s16(s4, s6);
-  x[7] = vqsubq_s16(s5, s7);
-  x[8] = vqaddq_s16(t[8], t[10]);
-  x[9] = vqaddq_s16(t[9], t[11]);
-  x[10] = vqsubq_s16(t[8], t[10]);
-  x[11] = vqsubq_s16(t[9], t[11]);
-  x[12] = vqaddq_s16(s12, s14);
-  x[13] = vqaddq_s16(s13, s15);
-  x[14] = vqsubq_s16(s12, s14);
-  x[15] = vqsubq_s16(s13, s15);
-
-  // Stage 8
-  btf_16_half_neon(x + 2, c1);
-  btf_16_half_neon(x + 6, c1);
-  btf_16_half_neon(x + 10, c1);
-  btf_16_half_neon(x + 14, c1);
-
-  // Stage 9
-  out[0] = x[0];
-  out[1] = vqnegq_s16(x[8]);
-  out[2] = x[12];
-  out[3] = vqnegq_s16(x[4]);
-  out[4] = x[6];
-  out[5] = vqnegq_s16(x[14]);
-  out[6] = x[10];
-  out[7] = vqnegq_s16(x[2]);
-  out[8] = x[3];
-  out[9] = vqnegq_s16(x[11]);
-  out[10] = x[15];
-  out[11] = vqnegq_s16(x[7]);
-  out[12] = x[5];
-  out[13] = vqnegq_s16(x[13]);
-  out[14] = x[9];
-  out[15] = vqnegq_s16(x[1]);
-}
-
-static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
-                               int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1[32], step2[32];
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
-                                      (int16_t)cospi[34], (int16_t)cospi[30]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
-                                      (int16_t)cospi[50], (int16_t)cospi[14]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
-                                      (int16_t)cospi[42], (int16_t)cospi[22]);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
-                                      (int16_t)cospi[58], (int16_t)cospi[6]);
-  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
-                                      (int16_t)cospi[36], (int16_t)cospi[28]);
-  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
-                                      (int16_t)cospi[52], (int16_t)cospi[12]);
-  const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c8 =
-      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
-                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
-  const int16x4_t c9 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-
-  // stage 2
-
-  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
-  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
-  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
-  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
-  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
-  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
-  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
-  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
-
-  step2[0] = in[0];
-  step2[1] = in[16];
-  step2[2] = in[8];
-  step2[3] = in[24];
-  step2[4] = in[4];
-  step2[5] = in[20];
-  step2[6] = in[12];
-  step2[7] = in[28];
-  step2[8] = in[2];
-  step2[9] = in[18];
-  step2[10] = in[10];
-  step2[11] = in[26];
-  step2[12] = in[6];
-  step2[13] = in[22];
-  step2[14] = in[14];
-  step2[15] = in[30];
-
-  // stage 3
-
-  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
-  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
-  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
-  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
-
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-  step1[4] = step2[4];
-  step1[5] = step2[5];
-  step1[6] = step2[6];
-  step1[7] = step2[7];
-
-  step1[16] = vqaddq_s16(step2[16], step2[17]);
-  step1[17] = vqsubq_s16(step2[16], step2[17]);
-  step1[18] = vqsubq_s16(step2[19], step2[18]);
-  step1[19] = vqaddq_s16(step2[19], step2[18]);
-  step1[20] = vqaddq_s16(step2[20], step2[21]);
-  step1[21] = vqsubq_s16(step2[20], step2[21]);
-  step1[22] = vqsubq_s16(step2[23], step2[22]);
-  step1[23] = vqaddq_s16(step2[23], step2[22]);
-  step1[24] = vqaddq_s16(step2[24], step2[25]);
-  step1[25] = vqsubq_s16(step2[24], step2[25]);
-  step1[26] = vqsubq_s16(step2[27], step2[26]);
-  step1[27] = vqaddq_s16(step2[27], step2[26]);
-  step1[28] = vqaddq_s16(step2[28], step2[29]);
-  step1[29] = vqsubq_s16(step2[28], step2[29]);
-  step1[30] = vqsubq_s16(step2[31], step2[30]);
-  step1[31] = vqaddq_s16(step2[31], step2[30]);
-
-  // stage 4
-
-  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
-  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
-  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
-  btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
-  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
-  btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);
-
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[8] = vqaddq_s16(step1[8], step1[9]);
-  step2[9] = vqsubq_s16(step1[8], step1[9]);
-  step2[10] = vqsubq_s16(step1[11], step1[10]);
-  step2[11] = vqaddq_s16(step1[11], step1[10]);
-  step2[12] = vqaddq_s16(step1[12], step1[13]);
-  step2[13] = vqsubq_s16(step1[12], step1[13]);
-  step2[14] = vqsubq_s16(step1[15], step1[14]);
-  step2[15] = vqaddq_s16(step1[15], step1[14]);
-  step2[16] = step1[16];
-  step2[19] = step1[19];
-  step2[20] = step1[20];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[27] = step1[27];
-  step2[28] = step1[28];
-  step2[31] = step1[31];
-
-  // stage 5
-
-  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
-  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
-  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
-  btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);
-
-  step1[4] = vqaddq_s16(step2[4], step2[5]);
-  step1[5] = vqsubq_s16(step2[4], step2[5]);
-  step1[6] = vqsubq_s16(step2[7], step2[6]);
-  step1[7] = vqaddq_s16(step2[7], step2[6]);
-  step1[8] = step2[8];
-  step1[11] = step2[11];
-  step1[12] = step2[12];
-  step1[15] = step2[15];
-  step1[16] = vqaddq_s16(step2[16], step2[19]);
-  step1[17] = vqaddq_s16(step2[17], step2[18]);
-  step1[18] = vqsubq_s16(step2[17], step2[18]);
-  step1[19] = vqsubq_s16(step2[16], step2[19]);
-  step1[20] = vqsubq_s16(step2[23], step2[20]);
-  step1[21] = vqsubq_s16(step2[22], step2[21]);
-  step1[22] = vqaddq_s16(step2[22], step2[21]);
-  step1[23] = vqaddq_s16(step2[23], step2[20]);
-  step1[24] = vqaddq_s16(step2[24], step2[27]);
-  step1[25] = vqaddq_s16(step2[25], step2[26]);
-  step1[26] = vqsubq_s16(step2[25], step2[26]);
-  step1[27] = vqsubq_s16(step2[24], step2[27]);
-  step1[28] = vqsubq_s16(step2[31], step2[28]);
-  step1[29] = vqsubq_s16(step2[30], step2[29]);
-  step1[30] = vqaddq_s16(step2[30], step2[29]);
-  step1[31] = vqaddq_s16(step2[31], step2[28]);
-
-  // stage 6
-
-  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
-  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
-  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
-  btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
-  btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[3]);
-  step2[1] = vqaddq_s16(step1[1], step1[2]);
-  step2[2] = vqsubq_s16(step1[1], step1[2]);
-  step2[3] = vqsubq_s16(step1[0], step1[3]);
-  step2[4] = step1[4];
-  step2[7] = step1[7];
-  step2[8] = vqaddq_s16(step1[8], step1[11]);
-  step2[9] = vqaddq_s16(step1[9], step1[10]);
-  step2[10] = vqsubq_s16(step1[9], step1[10]);
-  step2[11] = vqsubq_s16(step1[8], step1[11]);
-  step2[12] = vqsubq_s16(step1[15], step1[12]);
-  step2[13] = vqsubq_s16(step1[14], step1[13]);
-  step2[14] = vqaddq_s16(step1[14], step1[13]);
-  step2[15] = vqaddq_s16(step1[15], step1[12]);
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[22] = step1[22];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[25] = step1[25];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-
-  // stage 7
-
-  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
-  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
-
-  step1[0] = vqaddq_s16(step2[0], step2[7]);
-  step1[1] = vqaddq_s16(step2[1], step2[6]);
-  step1[2] = vqaddq_s16(step2[2], step2[5]);
-  step1[3] = vqaddq_s16(step2[3], step2[4]);
-  step1[4] = vqsubq_s16(step2[3], step2[4]);
-  step1[5] = vqsubq_s16(step2[2], step2[5]);
-  step1[6] = vqsubq_s16(step2[1], step2[6]);
-  step1[7] = vqsubq_s16(step2[0], step2[7]);
-  step1[8] = step2[8];
-  step1[9] = step2[9];
-  step1[14] = step2[14];
-  step1[15] = step2[15];
-  step1[16] = vqaddq_s16(step2[16], step2[23]);
-  step1[17] = vqaddq_s16(step2[17], step2[22]);
-  step1[18] = vqaddq_s16(step2[18], step2[21]);
-  step1[19] = vqaddq_s16(step2[19], step2[20]);
-  step1[20] = vqsubq_s16(step2[19], step2[20]);
-  step1[21] = vqsubq_s16(step2[18], step2[21]);
-  step1[22] = vqsubq_s16(step2[17], step2[22]);
-  step1[23] = vqsubq_s16(step2[16], step2[23]);
-  step1[24] = vqsubq_s16(step2[31], step2[24]);
-  step1[25] = vqsubq_s16(step2[30], step2[25]);
-  step1[26] = vqsubq_s16(step2[29], step2[26]);
-  step1[27] = vqsubq_s16(step2[28], step2[27]);
-  step1[28] = vqaddq_s16(step2[27], step2[28]);
-  step1[29] = vqaddq_s16(step2[26], step2[29]);
-  step1[30] = vqaddq_s16(step2[25], step2[30]);
-  step1[31] = vqaddq_s16(step2[24], step2[31]);
-
-  // stage 8
-
-  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
-  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
-  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
-  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[15]);
-  step2[1] = vqaddq_s16(step1[1], step1[14]);
-  step2[2] = vqaddq_s16(step1[2], step1[13]);
-  step2[3] = vqaddq_s16(step1[3], step1[12]);
-  step2[4] = vqaddq_s16(step1[4], step1[11]);
-  step2[5] = vqaddq_s16(step1[5], step1[10]);
-  step2[6] = vqaddq_s16(step1[6], step1[9]);
-  step2[7] = vqaddq_s16(step1[7], step1[8]);
-  step2[8] = vqsubq_s16(step1[7], step1[8]);
-  step2[9] = vqsubq_s16(step1[6], step1[9]);
-  step2[10] = vqsubq_s16(step1[5], step1[10]);
-  step2[11] = vqsubq_s16(step1[4], step1[11]);
-  step2[12] = vqsubq_s16(step1[3], step1[12]);
-  step2[13] = vqsubq_s16(step1[2], step1[13]);
-  step2[14] = vqsubq_s16(step1[1], step1[14]);
-  step2[15] = vqsubq_s16(step1[0], step1[15]);
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[18] = step1[18];
-  step2[19] = step1[19];
-  step2[28] = step1[28];
-  step2[29] = step1[29];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-
-  // stage 9
-
-  out[0] = vqaddq_s16(step2[0], step2[31]);
-  out[1] = vqaddq_s16(step2[1], step2[30]);
-  out[2] = vqaddq_s16(step2[2], step2[29]);
-  out[3] = vqaddq_s16(step2[3], step2[28]);
-  out[4] = vqaddq_s16(step2[4], step2[27]);
-  out[5] = vqaddq_s16(step2[5], step2[26]);
-  out[6] = vqaddq_s16(step2[6], step2[25]);
-  out[7] = vqaddq_s16(step2[7], step2[24]);
-  out[8] = vqaddq_s16(step2[8], step2[23]);
-  out[9] = vqaddq_s16(step2[9], step2[22]);
-  out[10] = vqaddq_s16(step2[10], step2[21]);
-  out[11] = vqaddq_s16(step2[11], step2[20]);
-  out[12] = vqaddq_s16(step2[12], step2[19]);
-  out[13] = vqaddq_s16(step2[13], step2[18]);
-  out[14] = vqaddq_s16(step2[14], step2[17]);
-  out[15] = vqaddq_s16(step2[15], step2[16]);
-  out[16] = vqsubq_s16(step2[15], step2[16]);
-  out[17] = vqsubq_s16(step2[14], step2[17]);
-  out[18] = vqsubq_s16(step2[13], step2[18]);
-  out[19] = vqsubq_s16(step2[12], step2[19]);
-  out[20] = vqsubq_s16(step2[11], step2[20]);
-  out[21] = vqsubq_s16(step2[10], step2[21]);
-  out[22] = vqsubq_s16(step2[9], step2[22]);
-  out[23] = vqsubq_s16(step2[8], step2[23]);
-  out[24] = vqsubq_s16(step2[7], step2[24]);
-  out[25] = vqsubq_s16(step2[6], step2[25]);
-  out[26] = vqsubq_s16(step2[5], step2[26]);
-  out[27] = vqsubq_s16(step2[4], step2[27]);
-  out[28] = vqsubq_s16(step2[3], step2[28]);
-  out[29] = vqsubq_s16(step2[2], step2[29]);
-  out[30] = vqsubq_s16(step2[1], step2[30]);
-  out[31] = vqsubq_s16(step2[0], step2[31]);
-}
-
-static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1;
-  int32x4_t t32[2];
-
-  // stage 1
-  // stage 2
-  // stage 3
-  // stage 4
-  // stage 5
-
-  t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
-  t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
-  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
-                       vrshrn_n_s32(t32[1], INV_COS_BIT));
-
-  // stage 6
-  // stage 7
-  // stage 8
-  // stage 9
-
-  out[0] = step1;
-  out[1] = step1;
-  out[2] = step1;
-  out[3] = step1;
-  out[4] = step1;
-  out[5] = step1;
-  out[6] = step1;
-  out[7] = step1;
-  out[8] = step1;
-  out[9] = step1;
-  out[10] = step1;
-  out[11] = step1;
-  out[12] = step1;
-  out[13] = step1;
-  out[14] = step1;
-  out[15] = step1;
-  out[16] = step1;
-  out[17] = step1;
-  out[18] = step1;
-  out[19] = step1;
-  out[20] = step1;
-  out[21] = step1;
-  out[22] = step1;
-  out[23] = step1;
-  out[24] = step1;
-  out[25] = step1;
-  out[26] = step1;
-  out[27] = step1;
-  out[28] = step1;
-  out[29] = step1;
-  out[30] = step1;
-  out[31] = step1;
-}
-
-static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1[32], step2[32];
-  int32x4_t t32[16];
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], cospi[48]);
-  const int16x4_t c2 =
-      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
-                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
-  const int16x4_t c3 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-  // stage 1
-  // stage 2
-
-  step2[0] = in[0];
-  step2[4] = in[4];
-  step2[8] = in[2];
-  step2[12] = in[6];
-
-  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
-  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
-  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
-  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[4] = step2[4];
-
-  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
-  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[16];
-  step1[18] = step2[19];
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  step1[21] = step2[20];
-  step1[22] = step2[23];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[24];
-  step1[26] = step2[27];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-  step1[29] = step2[28];
-  step1[30] = step2[31];
-  step1[31] = step2[31];
-
-  // stage 4
-
-  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
-  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
-  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
-  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
-  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
-
-  step2[0] = step1[0];
-  step2[8] = step1[8];
-  step2[9] = step1[8];
-  step2[10] = step1[11];
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-  step2[13] = step1[12];
-  step2[14] = step1[15];
-  step2[15] = step1[15];
-  step2[16] = step1[16];
-  step2[19] = step1[19];
-  step2[20] = step1[20];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[27] = step1[27];
-  step2[28] = step1[28];
-  step2[31] = step1[31];
-
-  // stage 5
-
-  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
-  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
-  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
-                          vrshrn_n_s32(t32[1], INV_COS_BIT));
-
-  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
-  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
-
-  step1[4] = step2[4];
-  step1[5] = step2[4];
-  step1[6] = step2[7];
-  step1[7] = step2[7];
-  step1[8] = step2[8];
-  step1[11] = step2[11];
-  step1[12] = step2[12];
-  step1[15] = step2[15];
-  step1[16] = vqaddq_s16(step2[16], step2[19]);
-  step1[17] = vqaddq_s16(step2[17], step2[18]);
-  step1[18] = vqsubq_s16(step2[17], step2[18]);
-  step1[19] = vqsubq_s16(step2[16], step2[19]);
-  step1[20] = vqsubq_s16(step2[23], step2[20]);
-  step1[21] = vqsubq_s16(step2[22], step2[21]);
-  step1[22] = vqaddq_s16(step2[22], step2[21]);
-  step1[23] = vqaddq_s16(step2[23], step2[20]);
-  step1[24] = vqaddq_s16(step2[24], step2[27]);
-  step1[25] = vqaddq_s16(step2[25], step2[26]);
-  step1[26] = vqsubq_s16(step2[25], step2[26]);
-  step1[27] = vqsubq_s16(step2[24], step2[27]);
-  step1[28] = vqsubq_s16(step2[31], step2[28]);
-  step1[29] = vqsubq_s16(step2[30], step2[29]);
-  step1[30] = vqaddq_s16(step2[30], step2[29]);
-  step1[31] = vqaddq_s16(step2[31], step2[28]);
-
-  // stage 6
-
-  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
-  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
-  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
-  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
-  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
-
-  step2[0] = step1[0];
-  step2[1] = step1[0];
-  step2[2] = step1[0];
-  step2[3] = step1[0];
-  step2[4] = step1[4];
-  step2[7] = step1[7];
-  step2[8] = vqaddq_s16(step1[8], step1[11]);
-  step2[9] = vqaddq_s16(step1[9], step1[10]);
-  step2[10] = vqsubq_s16(step1[9], step1[10]);
-  step2[11] = vqsubq_s16(step1[8], step1[11]);
-  step2[12] = vqsubq_s16(step1[15], step1[12]);
-  step2[13] = vqsubq_s16(step1[14], step1[13]);
-  step2[14] = vqaddq_s16(step1[14], step1[13]);
-  step2[15] = vqaddq_s16(step1[15], step1[12]);
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[22] = step1[22];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[25] = step1[25];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-
-  // stage 7
-
-  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
-  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
-
-  step1[0] = vqaddq_s16(step2[0], step2[7]);
-  step1[1] = vqaddq_s16(step2[1], step2[6]);
-  step1[2] = vqaddq_s16(step2[2], step2[5]);
-  step1[3] = vqaddq_s16(step2[3], step2[4]);
-  step1[4] = vqsubq_s16(step2[3], step2[4]);
-  step1[5] = vqsubq_s16(step2[2], step2[5]);
-  step1[6] = vqsubq_s16(step2[1], step2[6]);
-  step1[7] = vqsubq_s16(step2[0], step2[7]);
-  step1[8] = step2[8];
-  step1[9] = step2[9];
-  step1[14] = step2[14];
-  step1[15] = step2[15];
-  step1[16] = vqaddq_s16(step2[16], step2[23]);
-  step1[17] = vqaddq_s16(step2[17], step2[22]);
-  step1[18] = vqaddq_s16(step2[18], step2[21]);
-  step1[19] = vqaddq_s16(step2[19], step2[20]);
-  step1[20] = vqsubq_s16(step2[19], step2[20]);
-  step1[21] = vqsubq_s16(step2[18], step2[21]);
-  step1[22] = vqsubq_s16(step2[17], step2[22]);
-  step1[23] = vqsubq_s16(step2[16], step2[23]);
-  step1[24] = vqsubq_s16(step2[31], step2[24]);
-  step1[25] = vqsubq_s16(step2[30], step2[25]);
-  step1[26] = vqsubq_s16(step2[29], step2[26]);
-  step1[27] = vqsubq_s16(step2[28], step2[27]);
-  step1[28] = vqaddq_s16(step2[27], step2[28]);
-  step1[29] = vqaddq_s16(step2[26], step2[29]);
-  step1[30] = vqaddq_s16(step2[25], step2[30]);
-  step1[31] = vqaddq_s16(step2[24], step2[31]);
-
-  // stage 8
-
-  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
-  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
-  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
-  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[15]);
-  step2[1] = vqaddq_s16(step1[1], step1[14]);
-  step2[2] = vqaddq_s16(step1[2], step1[13]);
-  step2[3] = vqaddq_s16(step1[3], step1[12]);
-  step2[4] = vqaddq_s16(step1[4], step1[11]);
-  step2[5] = vqaddq_s16(step1[5], step1[10]);
-  step2[6] = vqaddq_s16(step1[6], step1[9]);
-  step2[7] = vqaddq_s16(step1[7], step1[8]);
-  step2[8] = vqsubq_s16(step1[7], step1[8]);
-  step2[9] = vqsubq_s16(step1[6], step1[9]);
-  step2[10] = vqsubq_s16(step1[5], step1[10]);
-  step2[11] = vqsubq_s16(step1[4], step1[11]);
-  step2[12] = vqsubq_s16(step1[3], step1[12]);
-  step2[13] = vqsubq_s16(step1[2], step1[13]);
-  step2[14] = vqsubq_s16(step1[1], step1[14]);
-  step2[15] = vqsubq_s16(step1[0], step1[15]);
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[18] = step1[18];
-  step2[19] = step1[19];
-  step2[28] = step1[28];
-  step2[29] = step1[29];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-
-  // stage 9
-
-  out[0] = vqaddq_s16(step2[0], step2[31]);
-  out[1] = vqaddq_s16(step2[1], step2[30]);
-  out[2] = vqaddq_s16(step2[2], step2[29]);
-  out[3] = vqaddq_s16(step2[3], step2[28]);
-  out[4] = vqaddq_s16(step2[4], step2[27]);
-  out[5] = vqaddq_s16(step2[5], step2[26]);
-  out[6] = vqaddq_s16(step2[6], step2[25]);
-  out[7] = vqaddq_s16(step2[7], step2[24]);
-  out[8] = vqaddq_s16(step2[8], step2[23]);
-  out[9] = vqaddq_s16(step2[9], step2[22]);
-  out[10] = vqaddq_s16(step2[10], step2[21]);
-  out[11] = vqaddq_s16(step2[11], step2[20]);
-  out[12] = vqaddq_s16(step2[12], step2[19]);
-  out[13] = vqaddq_s16(step2[13], step2[18]);
-  out[14] = vqaddq_s16(step2[14], step2[17]);
-  out[15] = vqaddq_s16(step2[15], step2[16]);
-  out[16] = vqsubq_s16(step2[15], step2[16]);
-  out[17] = vqsubq_s16(step2[14], step2[17]);
-  out[18] = vqsubq_s16(step2[13], step2[18]);
-  out[19] = vqsubq_s16(step2[12], step2[19]);
-  out[20] = vqsubq_s16(step2[11], step2[20]);
-  out[21] = vqsubq_s16(step2[10], step2[21]);
-  out[22] = vqsubq_s16(step2[9], step2[22]);
-  out[23] = vqsubq_s16(step2[8], step2[23]);
-  out[24] = vqsubq_s16(step2[7], step2[24]);
-  out[25] = vqsubq_s16(step2[6], step2[25]);
-  out[26] = vqsubq_s16(step2[5], step2[26]);
-  out[27] = vqsubq_s16(step2[4], step2[27]);
-  out[28] = vqsubq_s16(step2[3], step2[28]);
-  out[29] = vqsubq_s16(step2[2], step2[29]);
-  out[30] = vqsubq_s16(step2[1], step2[30]);
-  out[31] = vqsubq_s16(step2[0], step2[31]);
-}
-
-static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1[32], step2[32];
-  int32x4_t t32[16];
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c2 =
-      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
-                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
-  const int16x4_t c3 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-
-  // stage 1
-  // stage 2
-
-  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
-  btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
-  btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
-  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
-  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
-  btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
-  btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
-  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
-
-  step2[0] = in[0];
-  step2[2] = in[8];
-  step2[4] = in[4];
-  step2[6] = in[12];
-  step2[8] = in[2];
-  step2[10] = in[10];
-  step2[12] = in[6];
-  step2[14] = in[14];
-
-  // stage 3
-
-  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
-  btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
-  btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
-  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
-
-  step1[0] = step2[0];
-  step1[2] = step2[2];
-  step1[4] = step2[4];
-  step1[6] = step2[6];
-  step1[16] = vqaddq_s16(step2[16], step2[17]);
-  step1[17] = vqsubq_s16(step2[16], step2[17]);
-  step1[18] = vqsubq_s16(step2[19], step2[18]);
-  step1[19] = vqaddq_s16(step2[19], step2[18]);
-  step1[20] = vqaddq_s16(step2[20], step2[21]);
-  step1[21] = vqsubq_s16(step2[20], step2[21]);
-  step1[22] = vqsubq_s16(step2[23], step2[22]);
-  step1[23] = vqaddq_s16(step2[23], step2[22]);
-  step1[24] = vqaddq_s16(step2[24], step2[25]);
-  step1[25] = vqsubq_s16(step2[24], step2[25]);
-  step1[26] = vqsubq_s16(step2[27], step2[26]);
-  step1[27] = vqaddq_s16(step2[27], step2[26]);
-  step1[28] = vqaddq_s16(step2[28], step2[29]);
-  step1[29] = vqsubq_s16(step2[28], step2[29]);
-  step1[30] = vqsubq_s16(step2[31], step2[30]);
-  step1[31] = vqaddq_s16(step2[31], step2[30]);
-
-  // stage 4
-
-  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
-  btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
-  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
-  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
-  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
-  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
-
-  step2[0] = step1[0];
-  step2[2] = step1[2];
-  step2[8] = vqaddq_s16(step1[8], step1[9]);
-  step2[9] = vqsubq_s16(step1[8], step1[9]);
-  step2[10] = vqsubq_s16(step1[11], step1[10]);
-  step2[11] = vqaddq_s16(step1[11], step1[10]);
-  step2[12] = vqaddq_s16(step1[12], step1[13]);
-  step2[13] = vqsubq_s16(step1[12], step1[13]);
-  step2[14] = vqsubq_s16(step1[15], step1[14]);
-  step2[15] = vqaddq_s16(step1[15], step1[14]);
-  step2[16] = step1[16];
-  step2[19] = step1[19];
-  step2[20] = step1[20];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[27] = step1[27];
-  step2[28] = step1[28];
-  step2[31] = step1[31];
-
-  // stage 5
-
-  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
-  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
-
-  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
-                          vrshrn_n_s32(t32[1], INV_COS_BIT));
-
-  btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
-  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
-  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
-
-  step1[4] = vqaddq_s16(step2[4], step2[5]);
-  step1[5] = vqsubq_s16(step2[4], step2[5]);
-  step1[6] = vqsubq_s16(step2[7], step2[6]);
-  step1[7] = vqaddq_s16(step2[7], step2[6]);
-  step1[8] = step2[8];
-  step1[11] = step2[11];
-  step1[12] = step2[12];
-  step1[15] = step2[15];
-  step1[16] = vqaddq_s16(step2[16], step2[19]);
-  step1[17] = vqaddq_s16(step2[17], step2[18]);
-  step1[18] = vqsubq_s16(step2[17], step2[18]);
-  step1[19] = vqsubq_s16(step2[16], step2[19]);
-  step1[20] = vqsubq_s16(step2[23], step2[20]);
-  step1[21] = vqsubq_s16(step2[22], step2[21]);
-  step1[22] = vqaddq_s16(step2[22], step2[21]);
-  step1[23] = vqaddq_s16(step2[23], step2[20]);
-  step1[24] = vqaddq_s16(step2[24], step2[27]);
-  step1[25] = vqaddq_s16(step2[25], step2[26]);
-  step1[26] = vqsubq_s16(step2[25], step2[26]);
-  step1[27] = vqsubq_s16(step2[24], step2[27]);
-  step1[28] = vqsubq_s16(step2[31], step2[28]);
-  step1[29] = vqsubq_s16(step2[30], step2[29]);
-  step1[30] = vqaddq_s16(step2[30], step2[29]);
-  step1[31] = vqaddq_s16(step2[31], step2[28]);
-
-  // stage 6
-
-  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
-  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
-  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
-  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
-  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[3]);
-  step2[1] = vqaddq_s16(step1[0], step1[2]);
-  step2[2] = vqsubq_s16(step1[0], step1[2]);
-  step2[3] = vqsubq_s16(step1[0], step1[3]);
-  step2[4] = step1[4];
-  step2[7] = step1[7];
-  step2[8] = vqaddq_s16(step1[8], step1[11]);
-  step2[9] = vqaddq_s16(step1[9], step1[10]);
-  step2[10] = vqsubq_s16(step1[9], step1[10]);
-  step2[11] = vqsubq_s16(step1[8], step1[11]);
-  step2[12] = vqsubq_s16(step1[15], step1[12]);
-  step2[13] = vqsubq_s16(step1[14], step1[13]);
-  step2[14] = vqaddq_s16(step1[14], step1[13]);
-  step2[15] = vqaddq_s16(step1[15], step1[12]);
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[22] = step1[22];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[25] = step1[25];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-
-  // stage 7
-
-  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
-  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
-
-  step1[0] = vqaddq_s16(step2[0], step2[7]);
-  step1[1] = vqaddq_s16(step2[1], step2[6]);
-  step1[2] = vqaddq_s16(step2[2], step2[5]);
-  step1[3] = vqaddq_s16(step2[3], step2[4]);
-  step1[4] = vqsubq_s16(step2[3], step2[4]);
-  step1[5] = vqsubq_s16(step2[2], step2[5]);
-  step1[6] = vqsubq_s16(step2[1], step2[6]);
-  step1[7] = vqsubq_s16(step2[0], step2[7]);
-  step1[8] = step2[8];
-  step1[9] = step2[9];
-  step1[14] = step2[14];
-  step1[15] = step2[15];
-  step1[16] = vqaddq_s16(step2[16], step2[23]);
-  step1[17] = vqaddq_s16(step2[17], step2[22]);
-  step1[18] = vqaddq_s16(step2[18], step2[21]);
-  step1[19] = vqaddq_s16(step2[19], step2[20]);
-  step1[20] = vqsubq_s16(step2[19], step2[20]);
-  step1[21] = vqsubq_s16(step2[18], step2[21]);
-  step1[22] = vqsubq_s16(step2[17], step2[22]);
-  step1[23] = vqsubq_s16(step2[16], step2[23]);
-  step1[24] = vqsubq_s16(step2[31], step2[24]);
-  step1[25] = vqsubq_s16(step2[30], step2[25]);
-  step1[26] = vqsubq_s16(step2[29], step2[26]);
-  step1[27] = vqsubq_s16(step2[28], step2[27]);
-  step1[28] = vqaddq_s16(step2[27], step2[28]);
-  step1[29] = vqaddq_s16(step2[26], step2[29]);
-  step1[30] = vqaddq_s16(step2[25], step2[30]);
-  step1[31] = vqaddq_s16(step2[24], step2[31]);
-
-  // stage 8
-
-  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
-  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
-  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
-  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[15]);
-  step2[1] = vqaddq_s16(step1[1], step1[14]);
-  step2[2] = vqaddq_s16(step1[2], step1[13]);
-  step2[3] = vqaddq_s16(step1[3], step1[12]);
-  step2[4] = vqaddq_s16(step1[4], step1[11]);
-  step2[5] = vqaddq_s16(step1[5], step1[10]);
-  step2[6] = vqaddq_s16(step1[6], step1[9]);
-  step2[7] = vqaddq_s16(step1[7], step1[8]);
-  step2[8] = vqsubq_s16(step1[7], step1[8]);
-  step2[9] = vqsubq_s16(step1[6], step1[9]);
-  step2[10] = vqsubq_s16(step1[5], step1[10]);
-  step2[11] = vqsubq_s16(step1[4], step1[11]);
-  step2[12] = vqsubq_s16(step1[3], step1[12]);
-  step2[13] = vqsubq_s16(step1[2], step1[13]);
-  step2[14] = vqsubq_s16(step1[1], step1[14]);
-  step2[15] = vqsubq_s16(step1[0], step1[15]);
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[18] = step1[18];
-  step2[19] = step1[19];
-  step2[28] = step1[28];
-  step2[29] = step1[29];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-
-  // stage 9
-
-  out[0] = vqaddq_s16(step2[0], step2[31]);
-  out[1] = vqaddq_s16(step2[1], step2[30]);
-  out[2] = vqaddq_s16(step2[2], step2[29]);
-  out[3] = vqaddq_s16(step2[3], step2[28]);
-  out[4] = vqaddq_s16(step2[4], step2[27]);
-  out[5] = vqaddq_s16(step2[5], step2[26]);
-  out[6] = vqaddq_s16(step2[6], step2[25]);
-  out[7] = vqaddq_s16(step2[7], step2[24]);
-  out[8] = vqaddq_s16(step2[8], step2[23]);
-  out[9] = vqaddq_s16(step2[9], step2[22]);
-  out[10] = vqaddq_s16(step2[10], step2[21]);
-  out[11] = vqaddq_s16(step2[11], step2[20]);
-  out[12] = vqaddq_s16(step2[12], step2[19]);
-  out[13] = vqaddq_s16(step2[13], step2[18]);
-  out[14] = vqaddq_s16(step2[14], step2[17]);
-  out[15] = vqaddq_s16(step2[15], step2[16]);
-  out[16] = vqsubq_s16(step2[15], step2[16]);
-  out[17] = vqsubq_s16(step2[14], step2[17]);
-  out[18] = vqsubq_s16(step2[13], step2[18]);
-  out[19] = vqsubq_s16(step2[12], step2[19]);
-  out[20] = vqsubq_s16(step2[11], step2[20]);
-  out[21] = vqsubq_s16(step2[10], step2[21]);
-  out[22] = vqsubq_s16(step2[9], step2[22]);
-  out[23] = vqsubq_s16(step2[8], step2[23]);
-  out[24] = vqsubq_s16(step2[7], step2[24]);
-  out[25] = vqsubq_s16(step2[6], step2[25]);
-  out[26] = vqsubq_s16(step2[5], step2[26]);
-  out[27] = vqsubq_s16(step2[4], step2[27]);
-  out[28] = vqsubq_s16(step2[3], step2[28]);
-  out[29] = vqsubq_s16(step2[2], step2[29]);
-  out[30] = vqsubq_s16(step2[1], step2[30]);
-  out[31] = vqsubq_s16(step2[0], step2[31]);
-}
-static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
-                                      int8_t cos_bit) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
-  btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
-  btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]);
-  btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]);
-
-  step1[0] = vqaddq_s16(step2[0], step2[15]);
-  step1[1] = vqaddq_s16(step2[1], step2[14]);
-  step1[2] = vqaddq_s16(step2[2], step2[13]);
-  step1[3] = vqaddq_s16(step2[3], step2[12]);
-  step1[4] = vqaddq_s16(step2[4], step2[11]);
-  step1[5] = vqaddq_s16(step2[5], step2[10]);
-  step1[6] = vqaddq_s16(step2[6], step2[9]);
-  step1[7] = vqaddq_s16(step2[7], step2[8]);
-  step1[8] = vqsubq_s16(step2[7], step2[8]);
-  step1[9] = vqsubq_s16(step2[6], step2[9]);
-  step1[10] = vqsubq_s16(step2[5], step2[10]);
-  step1[11] = vqsubq_s16(step2[4], step2[11]);
-  step1[12] = vqsubq_s16(step2[3], step2[12]);
-  step1[13] = vqsubq_s16(step2[2], step2[13]);
-  step1[14] = vqsubq_s16(step2[1], step2[14]);
-  step1[15] = vqsubq_s16(step2[0], step2[15]);
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-  step1[32] = vqaddq_s16(step2[32], step2[47]);
-  step1[33] = vqaddq_s16(step2[33], step2[46]);
-  step1[34] = vqaddq_s16(step2[34], step2[45]);
-  step1[35] = vqaddq_s16(step2[35], step2[44]);
-  step1[36] = vqaddq_s16(step2[36], step2[43]);
-  step1[37] = vqaddq_s16(step2[37], step2[42]);
-  step1[38] = vqaddq_s16(step2[38], step2[41]);
-  step1[39] = vqaddq_s16(step2[39], step2[40]);
-  step1[40] = vqsubq_s16(step2[39], step2[40]);
-  step1[41] = vqsubq_s16(step2[38], step2[41]);
-  step1[42] = vqsubq_s16(step2[37], step2[42]);
-  step1[43] = vqsubq_s16(step2[36], step2[43]);
-  step1[44] = vqsubq_s16(step2[35], step2[44]);
-  step1[45] = vqsubq_s16(step2[34], step2[45]);
-  step1[46] = vqsubq_s16(step2[33], step2[46]);
-  step1[47] = vqsubq_s16(step2[32], step2[47]);
-  step1[48] = vqsubq_s16(step2[63], step2[48]);
-  step1[49] = vqsubq_s16(step2[62], step2[49]);
-  step1[50] = vqsubq_s16(step2[61], step2[50]);
-  step1[51] = vqsubq_s16(step2[60], step2[51]);
-  step1[52] = vqsubq_s16(step2[59], step2[52]);
-  step1[53] = vqsubq_s16(step2[58], step2[53]);
-  step1[54] = vqsubq_s16(step2[57], step2[54]);
-  step1[55] = vqsubq_s16(step2[56], step2[55]);
-  step1[56] = vqaddq_s16(step2[56], step2[55]);
-  step1[57] = vqaddq_s16(step2[57], step2[54]);
-  step1[58] = vqaddq_s16(step2[58], step2[53]);
-  step1[59] = vqaddq_s16(step2[59], step2[52]);
-  step1[60] = vqaddq_s16(step2[60], step2[51]);
-  step1[61] = vqaddq_s16(step2[61], step2[50]);
-  step1[62] = vqaddq_s16(step2[62], step2[49]);
-  step1[63] = vqaddq_s16(step2[63], step2[48]);
-}
-
-static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
-                                       int8_t cos_bit) {
-  const int32_t *cospi = cospi_arr(cos_bit);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-
-  btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
-  btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
-  btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]);
-  btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]);
-  btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]);
-  btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]);
-  btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]);
-  btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[31]);
-  step2[1] = vqaddq_s16(step1[1], step1[30]);
-  step2[2] = vqaddq_s16(step1[2], step1[29]);
-  step2[3] = vqaddq_s16(step1[3], step1[28]);
-  step2[4] = vqaddq_s16(step1[4], step1[27]);
-  step2[5] = vqaddq_s16(step1[5], step1[26]);
-  step2[6] = vqaddq_s16(step1[6], step1[25]);
-  step2[7] = vqaddq_s16(step1[7], step1[24]);
-  step2[8] = vqaddq_s16(step1[8], step1[23]);
-  step2[9] = vqaddq_s16(step1[9], step1[22]);
-  step2[10] = vqaddq_s16(step1[10], step1[21]);
-  step2[11] = vqaddq_s16(step1[11], step1[20]);
-  step2[12] = vqaddq_s16(step1[12], step1[19]);
-  step2[13] = vqaddq_s16(step1[13], step1[18]);
-  step2[14] = vqaddq_s16(step1[14], step1[17]);
-  step2[15] = vqaddq_s16(step1[15], step1[16]);
-  step2[16] = vqsubq_s16(step1[15], step1[16]);
-  step2[17] = vqsubq_s16(step1[14], step1[17]);
-  step2[18] = vqsubq_s16(step1[13], step1[18]);
-  step2[19] = vqsubq_s16(step1[12], step1[19]);
-  step2[20] = vqsubq_s16(step1[11], step1[20]);
-  step2[21] = vqsubq_s16(step1[10], step1[21]);
-  step2[22] = vqsubq_s16(step1[9], step1[22]);
-  step2[23] = vqsubq_s16(step1[8], step1[23]);
-  step2[24] = vqsubq_s16(step1[7], step1[24]);
-  step2[25] = vqsubq_s16(step1[6], step1[25]);
-  step2[26] = vqsubq_s16(step1[5], step1[26]);
-  step2[27] = vqsubq_s16(step1[4], step1[27]);
-  step2[28] = vqsubq_s16(step1[3], step1[28]);
-  step2[29] = vqsubq_s16(step1[2], step1[29]);
-  step2[30] = vqsubq_s16(step1[1], step1[30]);
-  step2[31] = vqsubq_s16(step1[0], step1[31]);
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[34] = step1[34];
-  step2[35] = step1[35];
-  step2[36] = step1[36];
-  step2[37] = step1[37];
-  step2[38] = step1[38];
-  step2[39] = step1[39];
-  step2[56] = step1[56];
-  step2[57] = step1[57];
-  step2[58] = step1[58];
-  step2[59] = step1[59];
-  step2[60] = step1[60];
-  step2[61] = step1[61];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-}
-
-static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step2[64], step1[64];
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
-                                      (int16_t)cospi[36], (int16_t)cospi[28]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
-                                      (int16_t)cospi[52], (int16_t)cospi[12]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c4 =
-      set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
-                     (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
-  const int16x4_t c5 =
-      set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
-                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
-  const int16x4_t c6 =
-      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
-                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
-  const int16x4_t c7 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-
-  // stage 1
-  // stage 2
-
-  step2[0] = in[0];
-  step2[2] = in[16];
-  step2[4] = in[8];
-  step2[6] = in[24];
-  step2[8] = in[4];
-  step2[10] = in[20];
-  step2[12] = in[12];
-  step2[14] = in[28];
-  step2[16] = in[2];
-  step2[18] = in[18];
-  step2[20] = in[10];
-  step2[22] = in[26];
-  step2[24] = in[6];
-  step2[26] = in[22];
-  step2[28] = in[14];
-  step2[30] = in[30];
-
-  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
-  btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
-  btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
-  btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
-  btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
-  btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
-  btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
-  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
-  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
-  btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
-  btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
-  btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
-  btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
-  btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
-  btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
-  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
-
-  // stage 3
-
-  step1[0] = step2[0];
-  step1[2] = step2[2];
-  step1[4] = step2[4];
-  step1[6] = step2[6];
-  step1[8] = step2[8];
-  step1[10] = step2[10];
-  step1[12] = step2[12];
-  step1[14] = step2[14];
-
-  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
-  btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
-  btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
-  btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
-  btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
-  btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
-  btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
-  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
-
-  step1[32] = vqaddq_s16(step2[32], step2[33]);
-  step1[33] = vqsubq_s16(step2[32], step2[33]);
-  step1[34] = vqsubq_s16(step2[35], step2[34]);
-  step1[35] = vqaddq_s16(step2[35], step2[34]);
-  step1[36] = vqaddq_s16(step2[36], step2[37]);
-  step1[37] = vqsubq_s16(step2[36], step2[37]);
-  step1[38] = vqsubq_s16(step2[39], step2[38]);
-  step1[39] = vqaddq_s16(step2[39], step2[38]);
-  step1[40] = vqaddq_s16(step2[40], step2[41]);
-  step1[41] = vqsubq_s16(step2[40], step2[41]);
-  step1[42] = vqsubq_s16(step2[43], step2[42]);
-  step1[43] = vqaddq_s16(step2[43], step2[42]);
-  step1[44] = vqaddq_s16(step2[44], step2[45]);
-  step1[45] = vqsubq_s16(step2[44], step2[45]);
-  step1[46] = vqsubq_s16(step2[47], step2[46]);
-  step1[47] = vqaddq_s16(step2[47], step2[46]);
-  step1[48] = vqaddq_s16(step2[48], step2[49]);
-  step1[49] = vqsubq_s16(step2[48], step2[49]);
-  step1[50] = vqsubq_s16(step2[51], step2[50]);
-  step1[51] = vqaddq_s16(step2[51], step2[50]);
-  step1[52] = vqaddq_s16(step2[52], step2[53]);
-  step1[53] = vqsubq_s16(step2[52], step2[53]);
-  step1[54] = vqsubq_s16(step2[55], step2[54]);
-  step1[55] = vqaddq_s16(step2[55], step2[54]);
-  step1[56] = vqaddq_s16(step2[56], step2[57]);
-  step1[57] = vqsubq_s16(step2[56], step2[57]);
-  step1[58] = vqsubq_s16(step2[59], step2[58]);
-  step1[59] = vqaddq_s16(step2[59], step2[58]);
-  step1[60] = vqaddq_s16(step2[60], step2[61]);
-  step1[61] = vqsubq_s16(step2[60], step2[61]);
-  step1[62] = vqsubq_s16(step2[63], step2[62]);
-  step1[63] = vqaddq_s16(step2[63], step2[62]);
-
-  // stage 4
-
-  step2[0] = step1[0];
-  step2[2] = step1[2];
-  step2[4] = step1[4];
-  step2[6] = step1[6];
-
-  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
-  btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
-  btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
-  btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
-  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
-  btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
-  btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
-  btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
-  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
-  btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
-  btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
-  btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
-
-  step2[16] = vqaddq_s16(step1[16], step1[17]);
-  step2[17] = vqsubq_s16(step1[16], step1[17]);
-  step2[18] = vqsubq_s16(step1[19], step1[18]);
-  step2[19] = vqaddq_s16(step1[19], step1[18]);
-  step2[20] = vqaddq_s16(step1[20], step1[21]);
-  step2[21] = vqsubq_s16(step1[20], step1[21]);
-  step2[22] = vqsubq_s16(step1[23], step1[22]);
-  step2[23] = vqaddq_s16(step1[23], step1[22]);
-  step2[24] = vqaddq_s16(step1[24], step1[25]);
-  step2[25] = vqsubq_s16(step1[24], step1[25]);
-  step2[26] = vqsubq_s16(step1[27], step1[26]);
-  step2[27] = vqaddq_s16(step1[27], step1[26]);
-  step2[28] = vqaddq_s16(step1[28], step1[29]);
-  step2[29] = vqsubq_s16(step1[28], step1[29]);
-  step2[30] = vqsubq_s16(step1[31], step1[30]);
-  step2[31] = vqaddq_s16(step1[31], step1[30]);
-  step2[32] = step1[32];
-  step2[35] = step1[35];
-  step2[36] = step1[36];
-  step2[39] = step1[39];
-  step2[40] = step1[40];
-  step2[43] = step1[43];
-  step2[44] = step1[44];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[51] = step1[51];
-  step2[52] = step1[52];
-  step2[55] = step1[55];
-  step2[56] = step1[56];
-  step2[59] = step1[59];
-  step2[60] = step1[60];
-  step2[63] = step1[63];
-
-  // stage 5
-
-  step1[0] = step2[0];
-  step1[2] = step2[2];
-
-  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
-  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
-  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
-  btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
-  btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
-  btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
-
-  step1[8] = vqaddq_s16(step2[8], step2[9]);
-  step1[9] = vqsubq_s16(step2[8], step2[9]);
-  step1[10] = vqsubq_s16(step2[11], step2[10]);
-  step1[11] = vqaddq_s16(step2[11], step2[10]);
-  step1[12] = vqaddq_s16(step2[12], step2[13]);
-  step1[13] = vqsubq_s16(step2[12], step2[13]);
-  step1[14] = vqsubq_s16(step2[15], step2[14]);
-  step1[15] = vqaddq_s16(step2[15], step2[14]);
-  step1[16] = step2[16];
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-  step1[31] = step2[31];
-  step1[32] = vqaddq_s16(step2[32], step2[35]);
-  step1[33] = vqaddq_s16(step2[33], step2[34]);
-  step1[34] = vqsubq_s16(step2[33], step2[34]);
-  step1[35] = vqsubq_s16(step2[32], step2[35]);
-  step1[36] = vqsubq_s16(step2[39], step2[36]);
-  step1[37] = vqsubq_s16(step2[38], step2[37]);
-  step1[38] = vqaddq_s16(step2[38], step2[37]);
-  step1[39] = vqaddq_s16(step2[39], step2[36]);
-  step1[40] = vqaddq_s16(step2[40], step2[43]);
-  step1[41] = vqaddq_s16(step2[41], step2[42]);
-  step1[42] = vqsubq_s16(step2[41], step2[42]);
-  step1[43] = vqsubq_s16(step2[40], step2[43]);
-  step1[44] = vqsubq_s16(step2[47], step2[44]);
-  step1[45] = vqsubq_s16(step2[46], step2[45]);
-  step1[46] = vqaddq_s16(step2[46], step2[45]);
-  step1[47] = vqaddq_s16(step2[47], step2[44]);
-  step1[48] = vqaddq_s16(step2[48], step2[51]);
-  step1[49] = vqaddq_s16(step2[49], step2[50]);
-  step1[50] = vqsubq_s16(step2[49], step2[50]);
-  step1[51] = vqsubq_s16(step2[48], step2[51]);
-  step1[52] = vqsubq_s16(step2[55], step2[52]);
-  step1[53] = vqsubq_s16(step2[54], step2[53]);
-  step1[54] = vqaddq_s16(step2[54], step2[53]);
-  step1[55] = vqaddq_s16(step2[55], step2[52]);
-  step1[56] = vqaddq_s16(step2[56], step2[59]);
-  step1[57] = vqaddq_s16(step2[57], step2[58]);
-  step1[58] = vqsubq_s16(step2[57], step2[58]);
-  step1[59] = vqsubq_s16(step2[56], step2[59]);
-  step1[60] = vqsubq_s16(step2[63], step2[60]);
-  step1[61] = vqsubq_s16(step2[62], step2[61]);
-  step1[62] = vqaddq_s16(step2[62], step2[61]);
-  step1[63] = vqaddq_s16(step2[63], step2[60]);
-
-  // stage 6
-
-  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
-  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
-  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
-  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
-  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
-  btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
-  btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
-  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
-  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
-  btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
-  btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
-
-  step2[4] = vqaddq_s16(step1[4], step1[5]);
-  step2[5] = vqsubq_s16(step1[4], step1[5]);
-  step2[6] = vqsubq_s16(step1[7], step1[6]);
-  step2[7] = vqaddq_s16(step1[7], step1[6]);
-  step2[8] = step1[8];
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-  step2[15] = step1[15];
-  step2[16] = vqaddq_s16(step1[16], step1[19]);
-  step2[17] = vqaddq_s16(step1[17], step1[18]);
-  step2[18] = vqsubq_s16(step1[17], step1[18]);
-  step2[19] = vqsubq_s16(step1[16], step1[19]);
-  step2[20] = vqsubq_s16(step1[23], step1[20]);
-  step2[21] = vqsubq_s16(step1[22], step1[21]);
-  step2[22] = vqaddq_s16(step1[22], step1[21]);
-  step2[23] = vqaddq_s16(step1[23], step1[20]);
-  step2[24] = vqaddq_s16(step1[24], step1[27]);
-  step2[25] = vqaddq_s16(step1[25], step1[26]);
-  step2[26] = vqsubq_s16(step1[25], step1[26]);
-  step2[27] = vqsubq_s16(step1[24], step1[27]);
-  step2[28] = vqsubq_s16(step1[31], step1[28]);
-  step2[29] = vqsubq_s16(step1[30], step1[29]);
-  step2[30] = vqaddq_s16(step1[30], step1[29]);
-  step2[31] = vqaddq_s16(step1[31], step1[28]);
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[38] = step1[38];
-  step2[39] = step1[39];
-  step2[40] = step1[40];
-  step2[41] = step1[41];
-  step2[46] = step1[46];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[49] = step1[49];
-  step2[54] = step1[54];
-  step2[55] = step1[55];
-  step2[56] = step1[56];
-  step2[57] = step1[57];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-
-  // stage 7
-
-  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
-  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
-  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
-  btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
-  btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
-
-  step1[0] = vqaddq_s16(step2[0], step2[3]);
-  step1[1] = vqaddq_s16(step2[1], step2[2]);
-  step1[2] = vqsubq_s16(step2[1], step2[2]);
-  step1[3] = vqsubq_s16(step2[0], step2[3]);
-  step1[4] = step2[4];
-  step1[7] = step2[7];
-  step1[8] = vqaddq_s16(step2[8], step2[11]);
-  step1[9] = vqaddq_s16(step2[9], step2[10]);
-  step1[10] = vqsubq_s16(step2[9], step2[10]);
-  step1[11] = vqsubq_s16(step2[8], step2[11]);
-  step1[12] = vqsubq_s16(step2[15], step2[12]);
-  step1[13] = vqsubq_s16(step2[14], step2[13]);
-  step1[14] = vqaddq_s16(step2[14], step2[13]);
-  step1[15] = vqaddq_s16(step2[15], step2[12]);
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-  step1[32] = vqaddq_s16(step2[32], step2[39]);
-  step1[33] = vqaddq_s16(step2[33], step2[38]);
-  step1[34] = vqaddq_s16(step2[34], step2[37]);
-  step1[35] = vqaddq_s16(step2[35], step2[36]);
-  step1[36] = vqsubq_s16(step2[35], step2[36]);
-  step1[37] = vqsubq_s16(step2[34], step2[37]);
-  step1[38] = vqsubq_s16(step2[33], step2[38]);
-  step1[39] = vqsubq_s16(step2[32], step2[39]);
-  step1[40] = vqsubq_s16(step2[47], step2[40]);
-  step1[41] = vqsubq_s16(step2[46], step2[41]);
-  step1[42] = vqsubq_s16(step2[45], step2[42]);
-  step1[43] = vqsubq_s16(step2[44], step2[43]);
-  step1[44] = vqaddq_s16(step2[43], step2[44]);
-  step1[45] = vqaddq_s16(step2[42], step2[45]);
-  step1[46] = vqaddq_s16(step2[41], step2[46]);
-  step1[47] = vqaddq_s16(step2[40], step2[47]);
-  step1[48] = vqaddq_s16(step2[48], step2[55]);
-  step1[49] = vqaddq_s16(step2[49], step2[54]);
-  step1[50] = vqaddq_s16(step2[50], step2[53]);
-  step1[51] = vqaddq_s16(step2[51], step2[52]);
-  step1[52] = vqsubq_s16(step2[51], step2[52]);
-  step1[53] = vqsubq_s16(step2[50], step2[53]);
-  step1[54] = vqsubq_s16(step2[49], step2[54]);
-  step1[55] = vqsubq_s16(step2[48], step2[55]);
-  step1[56] = vqsubq_s16(step2[63], step2[56]);
-  step1[57] = vqsubq_s16(step2[62], step2[57]);
-  step1[58] = vqsubq_s16(step2[61], step2[58]);
-  step1[59] = vqsubq_s16(step2[60], step2[59]);
-  step1[60] = vqaddq_s16(step2[59], step2[60]);
-  step1[61] = vqaddq_s16(step2[58], step2[61]);
-  step1[62] = vqaddq_s16(step2[57], step2[62]);
-  step1[63] = vqaddq_s16(step2[56], step2[63]);
-
-  // stage 8
-
-  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
-  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
-  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
-  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
-  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
-  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
-  btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
-  btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
-  btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
-  btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[7]);
-  step2[1] = vqaddq_s16(step1[1], step1[6]);
-  step2[2] = vqaddq_s16(step1[2], step1[5]);
-  step2[3] = vqaddq_s16(step1[3], step1[4]);
-  step2[4] = vqsubq_s16(step1[3], step1[4]);
-  step2[5] = vqsubq_s16(step1[2], step1[5]);
-  step2[6] = vqsubq_s16(step1[1], step1[6]);
-  step2[7] = vqsubq_s16(step1[0], step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-  step2[16] = vqaddq_s16(step1[16], step1[23]);
-  step2[17] = vqaddq_s16(step1[17], step1[22]);
-  step2[18] = vqaddq_s16(step1[18], step1[21]);
-  step2[19] = vqaddq_s16(step1[19], step1[20]);
-  step2[20] = vqsubq_s16(step1[19], step1[20]);
-  step2[21] = vqsubq_s16(step1[18], step1[21]);
-  step2[22] = vqsubq_s16(step1[17], step1[22]);
-  step2[23] = vqsubq_s16(step1[16], step1[23]);
-  step2[24] = vqsubq_s16(step1[31], step1[24]);
-  step2[25] = vqsubq_s16(step1[30], step1[25]);
-  step2[26] = vqsubq_s16(step1[29], step1[26]);
-  step2[27] = vqsubq_s16(step1[28], step1[27]);
-  step2[28] = vqaddq_s16(step1[28], step1[27]);
-  step2[29] = vqaddq_s16(step1[29], step1[26]);
-  step2[30] = vqaddq_s16(step1[30], step1[25]);
-  step2[31] = vqaddq_s16(step1[31], step1[24]);
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[34] = step1[34];
-  step2[35] = step1[35];
-  step2[44] = step1[44];
-  step2[45] = step1[45];
-  step2[46] = step1[46];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[49] = step1[49];
-  step2[50] = step1[50];
-  step2[51] = step1[51];
-  step2[60] = step1[60];
-  step2[61] = step1[61];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-
-  // stage 9
-  idct64_stage9_neon(step2, step1, cos_bit);
-
-  // stage 10
-  idct64_stage10_neon(step1, step2, cos_bit);
-
-  // stage 11
-
-  out[0] = vqaddq_s16(step2[0], step2[63]);
-  out[1] = vqaddq_s16(step2[1], step2[62]);
-  out[2] = vqaddq_s16(step2[2], step2[61]);
-  out[3] = vqaddq_s16(step2[3], step2[60]);
-  out[4] = vqaddq_s16(step2[4], step2[59]);
-  out[5] = vqaddq_s16(step2[5], step2[58]);
-  out[6] = vqaddq_s16(step2[6], step2[57]);
-  out[7] = vqaddq_s16(step2[7], step2[56]);
-  out[8] = vqaddq_s16(step2[8], step2[55]);
-  out[9] = vqaddq_s16(step2[9], step2[54]);
-  out[10] = vqaddq_s16(step2[10], step2[53]);
-  out[11] = vqaddq_s16(step2[11], step2[52]);
-  out[12] = vqaddq_s16(step2[12], step2[51]);
-  out[13] = vqaddq_s16(step2[13], step2[50]);
-  out[14] = vqaddq_s16(step2[14], step2[49]);
-  out[15] = vqaddq_s16(step2[15], step2[48]);
-  out[16] = vqaddq_s16(step2[16], step2[47]);
-  out[17] = vqaddq_s16(step2[17], step2[46]);
-  out[18] = vqaddq_s16(step2[18], step2[45]);
-  out[19] = vqaddq_s16(step2[19], step2[44]);
-  out[20] = vqaddq_s16(step2[20], step2[43]);
-  out[21] = vqaddq_s16(step2[21], step2[42]);
-  out[22] = vqaddq_s16(step2[22], step2[41]);
-  out[23] = vqaddq_s16(step2[23], step2[40]);
-  out[24] = vqaddq_s16(step2[24], step2[39]);
-  out[25] = vqaddq_s16(step2[25], step2[38]);
-  out[26] = vqaddq_s16(step2[26], step2[37]);
-  out[27] = vqaddq_s16(step2[27], step2[36]);
-  out[28] = vqaddq_s16(step2[28], step2[35]);
-  out[29] = vqaddq_s16(step2[29], step2[34]);
-  out[30] = vqaddq_s16(step2[30], step2[33]);
-  out[31] = vqaddq_s16(step2[31], step2[32]);
-  out[32] = vqsubq_s16(step2[31], step2[32]);
-  out[33] = vqsubq_s16(step2[30], step2[33]);
-  out[34] = vqsubq_s16(step2[29], step2[34]);
-  out[35] = vqsubq_s16(step2[28], step2[35]);
-  out[36] = vqsubq_s16(step2[27], step2[36]);
-  out[37] = vqsubq_s16(step2[26], step2[37]);
-  out[38] = vqsubq_s16(step2[25], step2[38]);
-  out[39] = vqsubq_s16(step2[24], step2[39]);
-  out[40] = vqsubq_s16(step2[23], step2[40]);
-  out[41] = vqsubq_s16(step2[22], step2[41]);
-  out[42] = vqsubq_s16(step2[21], step2[42]);
-  out[43] = vqsubq_s16(step2[20], step2[43]);
-  out[44] = vqsubq_s16(step2[19], step2[44]);
-  out[45] = vqsubq_s16(step2[18], step2[45]);
-  out[46] = vqsubq_s16(step2[17], step2[46]);
-  out[47] = vqsubq_s16(step2[16], step2[47]);
-  out[48] = vqsubq_s16(step2[15], step2[48]);
-  out[49] = vqsubq_s16(step2[14], step2[49]);
-  out[50] = vqsubq_s16(step2[13], step2[50]);
-  out[51] = vqsubq_s16(step2[12], step2[51]);
-  out[52] = vqsubq_s16(step2[11], step2[52]);
-  out[53] = vqsubq_s16(step2[10], step2[53]);
-  out[54] = vqsubq_s16(step2[9], step2[54]);
-  out[55] = vqsubq_s16(step2[8], step2[55]);
-  out[56] = vqsubq_s16(step2[7], step2[56]);
-  out[57] = vqsubq_s16(step2[6], step2[57]);
-  out[58] = vqsubq_s16(step2[5], step2[58]);
-  out[59] = vqsubq_s16(step2[4], step2[59]);
-  out[60] = vqsubq_s16(step2[3], step2[60]);
-  out[61] = vqsubq_s16(step2[2], step2[61]);
-  out[62] = vqsubq_s16(step2[1], step2[62]);
-  out[63] = vqsubq_s16(step2[0], step2[63]);
-}
-
-static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step1;
-  int32x4_t t32[2];
-
-  // stage 1
-  // stage 2
-  // stage 3
-  // stage 4
-  // stage 5
-  // stage 6
-
-  t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
-  t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
-
-  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
-                       vrshrn_n_s32(t32[1], INV_COS_BIT));
-  // stage 7
-  // stage 8
-  // stage 9
-  // stage 10
-  // stage 11
-  out[0] = step1;
-  out[1] = step1;
-  out[2] = step1;
-  out[3] = step1;
-  out[4] = step1;
-  out[5] = step1;
-  out[6] = step1;
-  out[7] = step1;
-  out[8] = step1;
-  out[9] = step1;
-  out[10] = step1;
-  out[11] = step1;
-  out[12] = step1;
-  out[13] = step1;
-  out[14] = step1;
-  out[15] = step1;
-  out[16] = step1;
-  out[17] = step1;
-  out[18] = step1;
-  out[19] = step1;
-  out[20] = step1;
-  out[21] = step1;
-  out[22] = step1;
-  out[23] = step1;
-  out[24] = step1;
-  out[25] = step1;
-  out[26] = step1;
-  out[27] = step1;
-  out[28] = step1;
-  out[29] = step1;
-  out[30] = step1;
-  out[31] = step1;
-  out[32] = step1;
-  out[33] = step1;
-  out[34] = step1;
-  out[35] = step1;
-  out[36] = step1;
-  out[37] = step1;
-  out[38] = step1;
-  out[39] = step1;
-  out[40] = step1;
-  out[41] = step1;
-  out[42] = step1;
-  out[43] = step1;
-  out[44] = step1;
-  out[45] = step1;
-  out[46] = step1;
-  out[47] = step1;
-  out[48] = step1;
-  out[49] = step1;
-  out[50] = step1;
-  out[51] = step1;
-  out[52] = step1;
-  out[53] = step1;
-  out[54] = step1;
-  out[55] = step1;
-  out[56] = step1;
-  out[57] = step1;
-  out[58] = step1;
-  out[59] = step1;
-  out[60] = step1;
-  out[61] = step1;
-  out[62] = step1;
-  out[63] = step1;
-}
-
-static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
-                                    int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step2[64], step1[64];
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
-                                      (int16_t)cospi[36], (int16_t)cospi[28]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
-                                      (int16_t)cospi[52], (int16_t)cospi[12]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c4 =
-      set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
-                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
-  const int16x4_t c5 =
-      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
-                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
-  const int16x4_t c6 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-
-  // stage 1
-  // stage 2
-
-  step2[0] = in[0];
-  step2[8] = in[4];
-  step2[16] = in[2];
-  step2[24] = in[6];
-
-  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
-  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
-  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
-  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
-
-  // stage 3
-
-  step1[0] = step2[0];
-  step1[8] = step2[8];
-
-  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
-  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
-
-  step1[32] = step2[32];
-  step1[33] = step2[32];
-  step1[38] = step2[39];
-  step1[39] = step2[39];
-  step1[40] = step2[40];
-  step1[41] = step2[40];
-  step1[46] = step2[47];
-  step1[47] = step2[47];
-  step1[48] = step2[48];
-  step1[49] = step2[48];
-  step1[54] = step2[55];
-  step1[55] = step2[55];
-  step1[56] = step2[56];
-  step1[57] = step2[56];
-  step1[62] = step2[63];
-  step1[63] = step2[63];
-
-  // stage 4
-
-  step2[0] = step1[0];
-
-  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
-  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
-  btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
-  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
-  btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);
-
-  step2[16] = step1[16];
-  step2[17] = step1[16];
-  step2[22] = step1[23];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[25] = step1[24];
-  step2[30] = step1[31];
-  step2[31] = step1[31];
-  step2[32] = step1[32];
-  step2[39] = step1[39];
-  step2[40] = step1[40];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[55] = step1[55];
-  step2[56] = step1[56];
-  step2[63] = step1[63];
-
-  // stage 5
-
-  step1[0] = step2[0];
-
-  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
-  btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);
-
-  step1[8] = step2[8];
-  step1[9] = step2[8];
-  step1[14] = step2[15];
-  step1[15] = step2[15];
-
-  step1[16] = step2[16];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[31] = step2[31];
-  step1[32] = step2[32];
-  step1[33] = step2[33];
-  step1[34] = step2[33];
-  step1[35] = step2[32];
-  step1[36] = step2[39];
-  step1[37] = step2[38];
-  step1[38] = step2[38];
-  step1[39] = step2[39];
-  step1[40] = step2[40];
-  step1[41] = step2[41];
-  step1[42] = step2[41];
-  step1[43] = step2[40];
-  step1[44] = step2[47];
-  step1[45] = step2[46];
-  step1[46] = step2[46];
-  step1[47] = step2[47];
-  step1[48] = step2[48];
-  step1[49] = step2[49];
-  step1[50] = step2[49];
-  step1[51] = step2[48];
-  step1[52] = step2[55];
-  step1[53] = step2[54];
-  step1[54] = step2[54];
-  step1[55] = step2[55];
-  step1[56] = step2[56];
-  step1[57] = step2[57];
-  step1[58] = step2[57];
-  step1[59] = step2[56];
-  step1[60] = step2[63];
-  step1[61] = step2[62];
-  step1[62] = step2[62];
-  step1[63] = step2[63];
-
-  // stage 6
-
-  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
-  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
-  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
-  btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
-  btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
-  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
-  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
-  btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
-  btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  step2[16] = step1[16];
-  step2[17] = step1[17];
-  step2[18] = step1[17];
-  step2[19] = step1[16];
-  step2[20] = step1[23];
-  step2[21] = step1[22];
-  step2[22] = step1[22];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[25] = step1[25];
-  step2[26] = step1[25];
-  step2[27] = step1[24];
-  step2[28] = step1[31];
-  step2[29] = step1[30];
-  step2[30] = step1[30];
-  step2[31] = step1[31];
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[38] = step1[38];
-  step2[39] = step1[39];
-  step2[40] = step1[40];
-  step2[41] = step1[41];
-  step2[46] = step1[46];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[49] = step1[49];
-  step2[54] = step1[54];
-  step2[55] = step1[55];
-  step2[56] = step1[56];
-  step2[57] = step1[57];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-
-  // stage 7
-
-  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
-  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
-  btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
-  btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);
-
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[1];
-  step1[3] = step2[0];
-  step1[8] = step2[8];
-  step1[9] = step2[9];
-  step1[10] = step2[9];
-  step1[11] = step2[8];
-  step1[12] = step2[15];
-  step1[13] = step2[14];
-  step1[14] = step2[14];
-  step1[15] = step2[15];
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-  step1[32] = vqaddq_s16(step2[32], step2[39]);
-  step1[33] = vqaddq_s16(step2[33], step2[38]);
-  step1[34] = vqaddq_s16(step2[34], step2[37]);
-  step1[35] = vqaddq_s16(step2[35], step2[36]);
-  step1[36] = vqsubq_s16(step2[35], step2[36]);
-  step1[37] = vqsubq_s16(step2[34], step2[37]);
-  step1[38] = vqsubq_s16(step2[33], step2[38]);
-  step1[39] = vqsubq_s16(step2[32], step2[39]);
-  step1[40] = vqsubq_s16(step2[47], step2[40]);
-  step1[41] = vqsubq_s16(step2[46], step2[41]);
-  step1[42] = vqsubq_s16(step2[45], step2[42]);
-  step1[43] = vqsubq_s16(step2[44], step2[43]);
-  step1[44] = vqaddq_s16(step2[43], step2[44]);
-  step1[45] = vqaddq_s16(step2[42], step2[45]);
-  step1[46] = vqaddq_s16(step2[41], step2[46]);
-  step1[47] = vqaddq_s16(step2[40], step2[47]);
-  step1[48] = vqaddq_s16(step2[48], step2[55]);
-  step1[49] = vqaddq_s16(step2[49], step2[54]);
-  step1[50] = vqaddq_s16(step2[50], step2[53]);
-  step1[51] = vqaddq_s16(step2[51], step2[52]);
-  step1[52] = vqsubq_s16(step2[51], step2[52]);
-  step1[53] = vqsubq_s16(step2[50], step2[53]);
-  step1[54] = vqsubq_s16(step2[49], step2[54]);
-  step1[55] = vqsubq_s16(step2[48], step2[55]);
-  step1[56] = vqsubq_s16(step2[63], step2[56]);
-  step1[57] = vqsubq_s16(step2[62], step2[57]);
-  step1[58] = vqsubq_s16(step2[61], step2[58]);
-  step1[59] = vqsubq_s16(step2[60], step2[59]);
-  step1[60] = vqaddq_s16(step2[59], step2[60]);
-  step1[61] = vqaddq_s16(step2[58], step2[61]);
-  step1[62] = vqaddq_s16(step2[57], step2[62]);
-  step1[63] = vqaddq_s16(step2[56], step2[63]);
-
-  // stage 8
-
-  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
-  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
-  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
-  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
-  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
-  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
-  btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
-  btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
-  btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
-  btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);
-
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[3];
-  step2[5] = step1[2];
-  step2[6] = step1[1];
-  step2[7] = step1[0];
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-  step2[16] = vqaddq_s16(step1[16], step1[23]);
-  step2[17] = vqaddq_s16(step1[17], step1[22]);
-  step2[18] = vqaddq_s16(step1[18], step1[21]);
-  step2[19] = vqaddq_s16(step1[19], step1[20]);
-  step2[20] = vqsubq_s16(step1[19], step1[20]);
-  step2[21] = vqsubq_s16(step1[18], step1[21]);
-  step2[22] = vqsubq_s16(step1[17], step1[22]);
-  step2[23] = vqsubq_s16(step1[16], step1[23]);
-  step2[24] = vqsubq_s16(step1[31], step1[24]);
-  step2[25] = vqsubq_s16(step1[30], step1[25]);
-  step2[26] = vqsubq_s16(step1[29], step1[26]);
-  step2[27] = vqsubq_s16(step1[28], step1[27]);
-  step2[28] = vqaddq_s16(step1[28], step1[27]);
-  step2[29] = vqaddq_s16(step1[29], step1[26]);
-  step2[30] = vqaddq_s16(step1[30], step1[25]);
-  step2[31] = vqaddq_s16(step1[31], step1[24]);
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[34] = step1[34];
-  step2[35] = step1[35];
-  step2[44] = step1[44];
-  step2[45] = step1[45];
-  step2[46] = step1[46];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[49] = step1[49];
-  step2[50] = step1[50];
-  step2[51] = step1[51];
-  step2[60] = step1[60];
-  step2[61] = step1[61];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-
-  // stage 9
-  idct64_stage9_neon(step2, step1, cos_bit);
-
-  // stage 10
-  idct64_stage10_neon(step1, step2, cos_bit);
-
-  // stage 11
-
-  out[0] = vqaddq_s16(step2[0], step2[63]);
-  out[1] = vqaddq_s16(step2[1], step2[62]);
-  out[2] = vqaddq_s16(step2[2], step2[61]);
-  out[3] = vqaddq_s16(step2[3], step2[60]);
-  out[4] = vqaddq_s16(step2[4], step2[59]);
-  out[5] = vqaddq_s16(step2[5], step2[58]);
-  out[6] = vqaddq_s16(step2[6], step2[57]);
-  out[7] = vqaddq_s16(step2[7], step2[56]);
-  out[8] = vqaddq_s16(step2[8], step2[55]);
-  out[9] = vqaddq_s16(step2[9], step2[54]);
-  out[10] = vqaddq_s16(step2[10], step2[53]);
-  out[11] = vqaddq_s16(step2[11], step2[52]);
-  out[12] = vqaddq_s16(step2[12], step2[51]);
-  out[13] = vqaddq_s16(step2[13], step2[50]);
-  out[14] = vqaddq_s16(step2[14], step2[49]);
-  out[15] = vqaddq_s16(step2[15], step2[48]);
-  out[16] = vqaddq_s16(step2[16], step2[47]);
-  out[17] = vqaddq_s16(step2[17], step2[46]);
-  out[18] = vqaddq_s16(step2[18], step2[45]);
-  out[19] = vqaddq_s16(step2[19], step2[44]);
-  out[20] = vqaddq_s16(step2[20], step2[43]);
-  out[21] = vqaddq_s16(step2[21], step2[42]);
-  out[22] = vqaddq_s16(step2[22], step2[41]);
-  out[23] = vqaddq_s16(step2[23], step2[40]);
-  out[24] = vqaddq_s16(step2[24], step2[39]);
-  out[25] = vqaddq_s16(step2[25], step2[38]);
-  out[26] = vqaddq_s16(step2[26], step2[37]);
-  out[27] = vqaddq_s16(step2[27], step2[36]);
-  out[28] = vqaddq_s16(step2[28], step2[35]);
-  out[29] = vqaddq_s16(step2[29], step2[34]);
-  out[30] = vqaddq_s16(step2[30], step2[33]);
-  out[31] = vqaddq_s16(step2[31], step2[32]);
-  out[32] = vqsubq_s16(step2[31], step2[32]);
-  out[33] = vqsubq_s16(step2[30], step2[33]);
-  out[34] = vqsubq_s16(step2[29], step2[34]);
-  out[35] = vqsubq_s16(step2[28], step2[35]);
-  out[36] = vqsubq_s16(step2[27], step2[36]);
-  out[37] = vqsubq_s16(step2[26], step2[37]);
-  out[38] = vqsubq_s16(step2[25], step2[38]);
-  out[39] = vqsubq_s16(step2[24], step2[39]);
-  out[40] = vqsubq_s16(step2[23], step2[40]);
-  out[41] = vqsubq_s16(step2[22], step2[41]);
-  out[42] = vqsubq_s16(step2[21], step2[42]);
-  out[43] = vqsubq_s16(step2[20], step2[43]);
-  out[44] = vqsubq_s16(step2[19], step2[44]);
-  out[45] = vqsubq_s16(step2[18], step2[45]);
-  out[46] = vqsubq_s16(step2[17], step2[46]);
-  out[47] = vqsubq_s16(step2[16], step2[47]);
-  out[48] = vqsubq_s16(step2[15], step2[48]);
-  out[49] = vqsubq_s16(step2[14], step2[49]);
-  out[50] = vqsubq_s16(step2[13], step2[50]);
-  out[51] = vqsubq_s16(step2[12], step2[51]);
-  out[52] = vqsubq_s16(step2[11], step2[52]);
-  out[53] = vqsubq_s16(step2[10], step2[53]);
-  out[54] = vqsubq_s16(step2[9], step2[54]);
-  out[55] = vqsubq_s16(step2[8], step2[55]);
-  out[56] = vqsubq_s16(step2[7], step2[56]);
-  out[57] = vqsubq_s16(step2[6], step2[57]);
-  out[58] = vqsubq_s16(step2[5], step2[58]);
-  out[59] = vqsubq_s16(step2[4], step2[59]);
-  out[60] = vqsubq_s16(step2[3], step2[60]);
-  out[61] = vqsubq_s16(step2[2], step2[61]);
-  out[62] = vqsubq_s16(step2[1], step2[62]);
-  out[63] = vqsubq_s16(step2[0], step2[63]);
-}
-
-static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
-                                     int8_t cos_bit, int bit) {
-  (void)bit;
-  const int32_t *cospi = cospi_arr(cos_bit);
-  int16x8_t step2[64], step1[64];
-
-  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
-                                      (int16_t)cospi[36], (int16_t)cospi[28]);
-  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
-                                      (int16_t)cospi[52], (int16_t)cospi[12]);
-  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
-                                      (int16_t)cospi[40], (int16_t)cospi[24]);
-  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
-                                      (int16_t)cospi[16], (int16_t)cospi[48]);
-  const int16x4_t c4 =
-      set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
-                     (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
-  const int16x4_t c5 =
-      set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
-                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
-  const int16x4_t c6 =
-      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
-                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
-  const int16x4_t c7 =
-      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
-                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
-
-  // stage 1
-  // stage 2
-
-  step2[0] = in[0];
-  step2[4] = in[8];
-  step2[8] = in[4];
-  step2[12] = in[12];
-  step2[16] = in[2];
-  step2[20] = in[10];
-  step2[24] = in[6];
-  step2[28] = in[14];
-
-  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
-  btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
-  btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
-  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
-  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
-  btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
-  btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
-  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
-
-  // stage 3
-
-  step1[0] = step2[0];
-  step1[4] = step2[4];
-  step1[8] = step2[8];
-  step1[12] = step2[12];
-
-  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
-  btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
-  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
-  btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
-
-  step1[32] = step2[32];
-  step1[33] = step2[32];
-  step1[34] = step2[35];
-  step1[35] = step2[35];
-  step1[36] = step2[36];
-  step1[37] = step2[36];
-  step1[38] = step2[39];
-  step1[39] = step2[39];
-  step1[40] = step2[40];
-  step1[41] = step2[40];
-  step1[42] = step2[43];
-  step1[43] = step2[43];
-  step1[44] = step2[44];
-  step1[45] = step2[44];
-  step1[46] = step2[47];
-  step1[47] = step2[47];
-  step1[48] = step2[48];
-  step1[49] = step2[48];
-  step1[50] = step2[51];
-  step1[51] = step2[51];
-  step1[52] = step2[52];
-  step1[53] = step2[52];
-  step1[54] = step2[55];
-  step1[55] = step2[55];
-  step1[56] = step2[56];
-  step1[57] = step2[56];
-  step1[58] = step2[59];
-  step1[59] = step2[59];
-  step1[60] = step2[60];
-  step1[61] = step2[60];
-  step1[62] = step2[63];
-  step1[63] = step2[63];
-
-  // stage 4
-
-  step2[0] = step1[0];
-  step2[4] = step1[4];
-
-  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
-  btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
-  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
-  btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
-  btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
-  btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
-  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
-  btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
-  btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
-  btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
-
-  step2[16] = step1[16];
-  step2[17] = step1[16];
-  step2[18] = step1[19];
-  step2[19] = step1[19];
-  step2[20] = step1[20];
-  step2[21] = step1[20];
-  step2[22] = step1[23];
-  step2[23] = step1[23];
-  step2[24] = step1[24];
-  step2[25] = step1[24];
-  step2[26] = step1[27];
-  step2[27] = step1[27];
-  step2[28] = step1[28];
-  step2[29] = step1[28];
-  step2[30] = step1[31];
-  step2[31] = step1[31];
-  step2[32] = step1[32];
-  step2[35] = step1[35];
-  step2[36] = step1[36];
-  step2[39] = step1[39];
-  step2[40] = step1[40];
-  step2[43] = step1[43];
-  step2[44] = step1[44];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[51] = step1[51];
-  step2[52] = step1[52];
-  step2[55] = step1[55];
-  step2[56] = step1[56];
-  step2[59] = step1[59];
-  step2[60] = step1[60];
-  step2[63] = step1[63];
-
-  // stage 5
-
-  step1[0] = step2[0];
-
-  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
-  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
-  btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
-  btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
-  btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
-
-  step1[8] = step2[8];
-  step1[9] = step2[8];
-  step1[10] = step2[11];
-  step1[11] = step2[11];
-  step1[12] = step2[12];
-  step1[13] = step2[12];
-  step1[14] = step2[15];
-  step1[15] = step2[15];
-  step1[16] = step2[16];
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-  step1[31] = step2[31];
-  step1[32] = vqaddq_s16(step2[32], step2[35]);
-  step1[33] = vqaddq_s16(step2[33], step2[34]);
-  step1[34] = vqsubq_s16(step2[33], step2[34]);
-  step1[35] = vqsubq_s16(step2[32], step2[35]);
-  step1[36] = vqsubq_s16(step2[39], step2[36]);
-  step1[37] = vqsubq_s16(step2[38], step2[37]);
-  step1[38] = vqaddq_s16(step2[38], step2[37]);
-  step1[39] = vqaddq_s16(step2[39], step2[36]);
-  step1[40] = vqaddq_s16(step2[40], step2[43]);
-  step1[41] = vqaddq_s16(step2[41], step2[42]);
-  step1[42] = vqsubq_s16(step2[41], step2[42]);
-  step1[43] = vqsubq_s16(step2[40], step2[43]);
-  step1[44] = vqsubq_s16(step2[47], step2[44]);
-  step1[45] = vqsubq_s16(step2[46], step2[45]);
-  step1[46] = vqaddq_s16(step2[46], step2[45]);
-  step1[47] = vqaddq_s16(step2[47], step2[44]);
-  step1[48] = vqaddq_s16(step2[48], step2[51]);
-  step1[49] = vqaddq_s16(step2[49], step2[50]);
-  step1[50] = vqsubq_s16(step2[49], step2[50]);
-  step1[51] = vqsubq_s16(step2[48], step2[51]);
-  step1[52] = vqsubq_s16(step2[55], step2[52]);
-  step1[53] = vqsubq_s16(step2[54], step2[53]);
-  step1[54] = vqaddq_s16(step2[54], step2[53]);
-  step1[55] = vqaddq_s16(step2[55], step2[52]);
-  step1[56] = vqaddq_s16(step2[56], step2[59]);
-  step1[57] = vqaddq_s16(step2[57], step2[58]);
-  step1[58] = vqsubq_s16(step2[57], step2[58]);
-  step1[59] = vqsubq_s16(step2[56], step2[59]);
-  step1[60] = vqsubq_s16(step2[63], step2[60]);
-  step1[61] = vqsubq_s16(step2[62], step2[61]);
-  step1[62] = vqaddq_s16(step2[62], step2[61]);
-  step1[63] = vqaddq_s16(step2[63], step2[60]);
-
-  // stage 6
-
-  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
-  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
-  btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
-  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
-  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
-  btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
-  btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
-  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
-  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
-  btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
-  btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
-
-  step2[4] = step1[4];
-  step2[5] = step1[4];
-  step2[6] = step1[7];
-  step2[7] = step1[7];
-  step2[8] = step1[8];
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-  step2[15] = step1[15];
-  step2[16] = vqaddq_s16(step1[16], step1[19]);
-  step2[17] = vqaddq_s16(step1[17], step1[18]);
-  step2[18] = vqsubq_s16(step1[17], step1[18]);
-  step2[19] = vqsubq_s16(step1[16], step1[19]);
-  step2[20] = vqsubq_s16(step1[23], step1[20]);
-  step2[21] = vqsubq_s16(step1[22], step1[21]);
-  step2[22] = vqaddq_s16(step1[22], step1[21]);
-  step2[23] = vqaddq_s16(step1[23], step1[20]);
-  step2[24] = vqaddq_s16(step1[24], step1[27]);
-  step2[25] = vqaddq_s16(step1[25], step1[26]);
-  step2[26] = vqsubq_s16(step1[25], step1[26]);
-  step2[27] = vqsubq_s16(step1[24], step1[27]);
-  step2[28] = vqsubq_s16(step1[31], step1[28]);
-  step2[29] = vqsubq_s16(step1[30], step1[29]);
-  step2[30] = vqaddq_s16(step1[30], step1[29]);
-  step2[31] = vqaddq_s16(step1[31], step1[28]);
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[38] = step1[38];
-  step2[39] = step1[39];
-  step2[40] = step1[40];
-  step2[41] = step1[41];
-  step2[46] = step1[46];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[49] = step1[49];
-  step2[54] = step1[54];
-  step2[55] = step1[55];
-  step2[56] = step1[56];
-  step2[57] = step1[57];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-
-  // stage 7
-
-  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
-  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
-  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
-  btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
-  btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
-
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[1];
-  step1[3] = step2[0];
-  step1[4] = step2[4];
-  step1[7] = step2[7];
-  step1[8] = vqaddq_s16(step2[8], step2[11]);
-  step1[9] = vqaddq_s16(step2[9], step2[10]);
-  step1[10] = vqsubq_s16(step2[9], step2[10]);
-  step1[11] = vqsubq_s16(step2[8], step2[11]);
-  step1[12] = vqsubq_s16(step2[15], step2[12]);
-  step1[13] = vqsubq_s16(step2[14], step2[13]);
-  step1[14] = vqaddq_s16(step2[14], step2[13]);
-  step1[15] = vqaddq_s16(step2[15], step2[12]);
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-  step1[32] = vqaddq_s16(step2[32], step2[39]);
-  step1[33] = vqaddq_s16(step2[33], step2[38]);
-  step1[34] = vqaddq_s16(step2[34], step2[37]);
-  step1[35] = vqaddq_s16(step2[35], step2[36]);
-  step1[36] = vqsubq_s16(step2[35], step2[36]);
-  step1[37] = vqsubq_s16(step2[34], step2[37]);
-  step1[38] = vqsubq_s16(step2[33], step2[38]);
-  step1[39] = vqsubq_s16(step2[32], step2[39]);
-  step1[40] = vqsubq_s16(step2[47], step2[40]);
-  step1[41] = vqsubq_s16(step2[46], step2[41]);
-  step1[42] = vqsubq_s16(step2[45], step2[42]);
-  step1[43] = vqsubq_s16(step2[44], step2[43]);
-  step1[44] = vqaddq_s16(step2[43], step2[44]);
-  step1[45] = vqaddq_s16(step2[42], step2[45]);
-  step1[46] = vqaddq_s16(step2[41], step2[46]);
-  step1[47] = vqaddq_s16(step2[40], step2[47]);
-  step1[48] = vqaddq_s16(step2[48], step2[55]);
-  step1[49] = vqaddq_s16(step2[49], step2[54]);
-  step1[50] = vqaddq_s16(step2[50], step2[53]);
-  step1[51] = vqaddq_s16(step2[51], step2[52]);
-  step1[52] = vqsubq_s16(step2[51], step2[52]);
-  step1[53] = vqsubq_s16(step2[50], step2[53]);
-  step1[54] = vqsubq_s16(step2[49], step2[54]);
-  step1[55] = vqsubq_s16(step2[48], step2[55]);
-  step1[56] = vqsubq_s16(step2[63], step2[56]);
-  step1[57] = vqsubq_s16(step2[62], step2[57]);
-  step1[58] = vqsubq_s16(step2[61], step2[58]);
-  step1[59] = vqsubq_s16(step2[60], step2[59]);
-  step1[60] = vqaddq_s16(step2[59], step2[60]);
-  step1[61] = vqaddq_s16(step2[58], step2[61]);
-  step1[62] = vqaddq_s16(step2[57], step2[62]);
-  step1[63] = vqaddq_s16(step2[56], step2[63]);
-
-  // stage 8
-
-  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
-  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
-  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
-  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
-  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
-  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
-  btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
-  btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
-  btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
-  btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
-
-  step2[0] = vqaddq_s16(step1[0], step1[7]);
-  step2[1] = vqaddq_s16(step1[1], step1[6]);
-  step2[2] = vqaddq_s16(step1[2], step1[5]);
-  step2[3] = vqaddq_s16(step1[3], step1[4]);
-  step2[4] = vqsubq_s16(step1[3], step1[4]);
-  step2[5] = vqsubq_s16(step1[2], step1[5]);
-  step2[6] = vqsubq_s16(step1[1], step1[6]);
-  step2[7] = vqsubq_s16(step1[0], step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-  step2[16] = vqaddq_s16(step1[16], step1[23]);
-  step2[17] = vqaddq_s16(step1[17], step1[22]);
-  step2[18] = vqaddq_s16(step1[18], step1[21]);
-  step2[19] = vqaddq_s16(step1[19], step1[20]);
-  step2[20] = vqsubq_s16(step1[19], step1[20]);
-  step2[21] = vqsubq_s16(step1[18], step1[21]);
-  step2[22] = vqsubq_s16(step1[17], step1[22]);
-  step2[23] = vqsubq_s16(step1[16], step1[23]);
-  step2[24] = vqsubq_s16(step1[31], step1[24]);
-  step2[25] = vqsubq_s16(step1[30], step1[25]);
-  step2[26] = vqsubq_s16(step1[29], step1[26]);
-  step2[27] = vqsubq_s16(step1[28], step1[27]);
-  step2[28] = vqaddq_s16(step1[28], step1[27]);
-  step2[29] = vqaddq_s16(step1[29], step1[26]);
-  step2[30] = vqaddq_s16(step1[30], step1[25]);
-  step2[31] = vqaddq_s16(step1[31], step1[24]);
-  step2[32] = step1[32];
-  step2[33] = step1[33];
-  step2[34] = step1[34];
-  step2[35] = step1[35];
-  step2[44] = step1[44];
-  step2[45] = step1[45];
-  step2[46] = step1[46];
-  step2[47] = step1[47];
-  step2[48] = step1[48];
-  step2[49] = step1[49];
-  step2[50] = step1[50];
-  step2[51] = step1[51];
-  step2[60] = step1[60];
-  step2[61] = step1[61];
-  step2[62] = step1[62];
-  step2[63] = step1[63];
-
-  // stage 9
-  idct64_stage9_neon(step2, step1, cos_bit);
-
-  // stage 10
-  idct64_stage10_neon(step1, step2, cos_bit);
-
-  // stage 11
-
-  out[0] = vqaddq_s16(step2[0], step2[63]);
-  out[1] = vqaddq_s16(step2[1], step2[62]);
-  out[2] = vqaddq_s16(step2[2], step2[61]);
-  out[3] = vqaddq_s16(step2[3], step2[60]);
-  out[4] = vqaddq_s16(step2[4], step2[59]);
-  out[5] = vqaddq_s16(step2[5], step2[58]);
-  out[6] = vqaddq_s16(step2[6], step2[57]);
-  out[7] = vqaddq_s16(step2[7], step2[56]);
-  out[8] = vqaddq_s16(step2[8], step2[55]);
-  out[9] = vqaddq_s16(step2[9], step2[54]);
-  out[10] = vqaddq_s16(step2[10], step2[53]);
-  out[11] = vqaddq_s16(step2[11], step2[52]);
-  out[12] = vqaddq_s16(step2[12], step2[51]);
-  out[13] = vqaddq_s16(step2[13], step2[50]);
-  out[14] = vqaddq_s16(step2[14], step2[49]);
-  out[15] = vqaddq_s16(step2[15], step2[48]);
-  out[16] = vqaddq_s16(step2[16], step2[47]);
-  out[17] = vqaddq_s16(step2[17], step2[46]);
-  out[18] = vqaddq_s16(step2[18], step2[45]);
-  out[19] = vqaddq_s16(step2[19], step2[44]);
-  out[20] = vqaddq_s16(step2[20], step2[43]);
-  out[21] = vqaddq_s16(step2[21], step2[42]);
-  out[22] = vqaddq_s16(step2[22], step2[41]);
-  out[23] = vqaddq_s16(step2[23], step2[40]);
-  out[24] = vqaddq_s16(step2[24], step2[39]);
-  out[25] = vqaddq_s16(step2[25], step2[38]);
-  out[26] = vqaddq_s16(step2[26], step2[37]);
-  out[27] = vqaddq_s16(step2[27], step2[36]);
-  out[28] = vqaddq_s16(step2[28], step2[35]);
-  out[29] = vqaddq_s16(step2[29], step2[34]);
-  out[30] = vqaddq_s16(step2[30], step2[33]);
-  out[31] = vqaddq_s16(step2[31], step2[32]);
-  out[32] = vqsubq_s16(step2[31], step2[32]);
-  out[33] = vqsubq_s16(step2[30], step2[33]);
-  out[34] = vqsubq_s16(step2[29], step2[34]);
-  out[35] = vqsubq_s16(step2[28], step2[35]);
-  out[36] = vqsubq_s16(step2[27], step2[36]);
-  out[37] = vqsubq_s16(step2[26], step2[37]);
-  out[38] = vqsubq_s16(step2[25], step2[38]);
-  out[39] = vqsubq_s16(step2[24], step2[39]);
-  out[40] = vqsubq_s16(step2[23], step2[40]);
-  out[41] = vqsubq_s16(step2[22], step2[41]);
-  out[42] = vqsubq_s16(step2[21], step2[42]);
-  out[43] = vqsubq_s16(step2[20], step2[43]);
-  out[44] = vqsubq_s16(step2[19], step2[44]);
-  out[45] = vqsubq_s16(step2[18], step2[45]);
-  out[46] = vqsubq_s16(step2[17], step2[46]);
-  out[47] = vqsubq_s16(step2[16], step2[47]);
-  out[48] = vqsubq_s16(step2[15], step2[48]);
-  out[49] = vqsubq_s16(step2[14], step2[49]);
-  out[50] = vqsubq_s16(step2[13], step2[50]);
-  out[51] = vqsubq_s16(step2[12], step2[51]);
-  out[52] = vqsubq_s16(step2[11], step2[52]);
-  out[53] = vqsubq_s16(step2[10], step2[53]);
-  out[54] = vqsubq_s16(step2[9], step2[54]);
-  out[55] = vqsubq_s16(step2[8], step2[55]);
-  out[56] = vqsubq_s16(step2[7], step2[56]);
-  out[57] = vqsubq_s16(step2[6], step2[57]);
-  out[58] = vqsubq_s16(step2[5], step2[58]);
-  out[59] = vqsubq_s16(step2[4], step2[59]);
-  out[60] = vqsubq_s16(step2[3], step2[60]);
-  out[61] = vqsubq_s16(step2[2], step2[61]);
-  out[62] = vqsubq_s16(step2[1], step2[62]);
-  out[63] = vqsubq_s16(step2[0], step2[63]);
-}
-
-// Functions for blocks with eob at DC and within
-// topleft 8x8, 16x16, 32x32 corner
-static const transform_neon
-    lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
-      {
-          { NULL, NULL, NULL, NULL },
-          { NULL, NULL, NULL, NULL },
-          { NULL, NULL, NULL, NULL },
-      },
-      { { idct8_low1_neon, idct8_neon, NULL, NULL },
-        { iadst8_low1_neon, iadst8_neon, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
-      {
-          { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
-          { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
-          { NULL, NULL, NULL, NULL },
-      },
-      { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
-      { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
-          idct64_low32_neon },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } }
-    };
-
-static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
-                                                  uint8_t *output, int stride,
-                                                  TX_TYPE tx_type,
-                                                  TX_SIZE tx_size, int eob) {
-  (void)tx_type;
-  int16x8_t a[32 * 4];
-  int16x8_t b[32 * 4];
-  int eobx, eoby;
-  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
-                               0);
-  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
-                               0);
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int32_t *input_1;
-  int temp_b = 0;
-
-  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
-    input_1 = input;
-    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
-      int k = j * 8 + i * txfm_size_col;
-      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
-      transpose_s16_8x8q(&a[k], &a[k]);
-      input_1 += 8;
-    }
-    input += (txfm_size_col * 8);
-    if (abs(rect_type) == 1) {
-      int y = i * txfm_size_col;
-      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
-    }
-    identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
-                             txw_idx, txfm_size_col, -shift[0]);
-    for (int j = 0; j < buf_size_w_div8; ++j) {
-      int k = j * 8 + i * txfm_size_col;
-      transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
-    }
-    temp_b += 8;
-  }
-  for (int j = 0; j < buf_size_w_div8; ++j) {
-    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
-                             txh_idx, txfm_size_row, -shift[1]);
-  }
-  if (txfm_size_col >= 16) {
-    for (int i = 0; i < (txfm_size_col >> 4); i++) {
-      lowbd_add_flip_buffer_16xn_neon(
-          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
-    }
-  } else if (txfm_size_col == 8) {
-    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  int16x8_t a[16 * 2];
-  int16x8_t b[16 * 2];
-  int eobx, eoby, ud_flip, lr_flip;
-  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
-                               0);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int32_t *input_1;
-  int temp_b = 0;
-  const transform_neon row_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-
-  assert(row_txfm != NULL);
-
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
-    input_1 = input;
-    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
-      int k = j * 8 + i * txfm_size_col;
-      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
-      transpose_s16_8x8q(&a[k], &a[k]);
-      input_1 += 8;
-    }
-    input += (txfm_size_col * 8);
-    if (abs(rect_type) == 1) {
-      int y = i * txfm_size_col;
-      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
-    }
-    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
-    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
-                                  -shift[0]);
-    if (lr_flip == 1) {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        int k = j * 8 + i * txfm_size_col;
-        flip_buf_ud_neon(&a[k], 8);
-        transpose_s16_8x8q(
-            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
-      }
-      temp_b += 8;
-    } else {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        int k = j * 8 + i * txfm_size_col;
-        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
-      }
-      temp_b += 8;
-    }
-  }
-  for (int j = 0; j < buf_size_w_div8; ++j) {
-    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
-                             txh_idx, txfm_size_row, -shift[1]);
-  }
-  if (txfm_size_col >= 16) {
-    for (int i = 0; i < (txfm_size_col >> 4); i++) {
-      lowbd_add_flip_buffer_16xn_neon(
-          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
-    }
-  } else if (txfm_size_col == 8) {
-    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  int16x8_t a[16 * 2];
-  int16x8_t b[16 * 2];
-  int eobx, eoby, ud_flip, lr_flip;
-  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
-                               0);
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
-  const int32_t *input_1;
-  int temp_b = 0;
-  const transform_neon col_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
-  assert(col_txfm != NULL);
-
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
-    input_1 = input;
-    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
-      int k = j * 8 + i * txfm_size_col;
-      load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
-      transpose_s16_8x8q(&a[k], &a[k]);
-      input_1 += 8;
-    }
-    input += (txfm_size_col * 8);
-    if (abs(rect_type) == 1) {
-      int y = i * txfm_size_col;
-      round_shift_for_rect(&a[y], &a[y], txfm_size_col);
-    }
-    identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
-                             txw_idx, txfm_size_col, -shift[0]);
-    for (int j = 0; j < buf_size_w_div8; ++j) {
-      int k = j * 8 + i * txfm_size_col;
-      transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
-    }
-    temp_b += 8;
-  }
-  for (int j = 0; j < buf_size_w_div8; ++j) {
-    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
-    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-                                  -shift[1]);
-  }
-  if (txfm_size_col >= 16) {
-    for (int i = 0; i < (txfm_size_col >> 4); i++) {
-      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
-                                      output + 16 * i, stride, ud_flip,
-                                      txfm_size_row);
-    }
-  } else if (txfm_size_col == 8) {
-    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
-                                                 uint8_t *output, int stride,
-                                                 TX_TYPE tx_type, int eob) {
-  (void)eob;
-  TX_SIZE tx_size = TX_4X4;
-  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
-  int32_t *temp_in = txfm_buf;
-
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-  int32_t *temp_out = temp_in + buf_offset;
-  int32_t *buf = temp_out + buf_offset;
-  int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
-  int r, bd = 8;
-  const transform_1d_neon row_txfm =
-      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_neon col_txfm =
-      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < txfm_size_row; i++) {
-    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
-
-    input += txfm_size_col;
-    buf_ptr += txfm_size_col;
-  }
-
-  for (int c = 0; c < txfm_size_col; ++c) {
-    if (lr_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + c];
-    } else {
-      // flip left right
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
-    }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
-    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
-    if (ud_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
-      }
-    } else {
-      // flip upside down
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
-      }
-    }
-  }
-}
-
-void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
-                                   int stride, TX_TYPE tx_type, int eob) {
-  (void)eob;
-  TX_SIZE tx_size = TX_4X8;
-  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
-  int32_t *temp_in = txfm_buf;
-
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-  int32_t *temp_out = temp_in + buf_offset;
-  int32_t *buf = temp_out + buf_offset;
-  int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
-                                                   16, 16, 16, 16 };
-  int r, bd = 8;
-  const transform_1d_neon row_txfm =
-      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_neon col_txfm =
-      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < txfm_size_row; i++) {
-    for (int j = 0; j < txfm_size_col; j++)
-      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
-
-    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
-    input += txfm_size_col;
-    buf_ptr += txfm_size_col;
-  }
-
-  for (int c = 0; c < txfm_size_col; ++c) {
-    if (lr_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + c];
-    } else {
-      // flip left right
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
-    }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
-    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
-    if (ud_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
-      }
-    } else {
-      // flip upside down
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
-      }
-    }
-  }
-}
-
-void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
-                                   int stride, TX_TYPE tx_type, int eob) {
-  (void)eob;
-  TX_SIZE tx_size = TX_8X4;
-  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
-  int32_t *temp_in = txfm_buf;
-
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-  int32_t *temp_out = temp_in + buf_offset;
-  int32_t *buf = temp_out + buf_offset;
-  int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
-                                                   16, 16, 16, 16 };
-  int r, bd = 8;
-  const transform_1d_neon row_txfm =
-      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_neon col_txfm =
-      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < txfm_size_row; i++) {
-    for (int j = 0; j < txfm_size_col; j++)
-      temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
-
-    row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
-    input += txfm_size_col;
-    buf_ptr += txfm_size_col;
-  }
-
-  for (int c = 0; c < txfm_size_col; ++c) {
-    if (lr_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + c];
-    } else {
-      // flip left right
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
-    }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
-    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
-    if (ud_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
-      }
-    } else {
-      // flip upside down
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
-      }
-    }
-  }
-}
-
-void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
-                                    int stride, TX_TYPE tx_type, int eob) {
-  (void)eob;
-  TX_SIZE tx_size = TX_4X16;
-  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
-  int32_t *temp_in = txfm_buf;
-
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-  int32_t *temp_out = temp_in + buf_offset;
-  int32_t *buf = temp_out + buf_offset;
-  int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
-                                                   16, 16, 16, 16, 16 };
-  int r, bd = 8;
-  const transform_1d_neon row_txfm =
-      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_neon col_txfm =
-      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < txfm_size_row; i++) {
-    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
-    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
-    input += txfm_size_col;
-    buf_ptr += txfm_size_col;
-  }
-
-  for (int c = 0; c < txfm_size_col; ++c) {
-    if (lr_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + c];
-    } else {
-      // flip left right
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
-    }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
-    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
-    if (ud_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
-      }
-    } else {
-      // flip upside down
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
-      }
-    }
-  }
-}
-
-void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
-                                    int stride, TX_TYPE tx_type, int eob) {
-  (void)eob;
-  TX_SIZE tx_size = TX_16X4;
-  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
-  int32_t *temp_in = txfm_buf;
-
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
-  int32_t *temp_out = temp_in + buf_offset;
-  int32_t *buf = temp_out + buf_offset;
-  int32_t *buf_ptr = buf;
-  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
-                                                   16, 16, 16, 16, 16 };
-  int r, bd = 8;
-  const transform_1d_neon row_txfm =
-      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_neon col_txfm =
-      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < txfm_size_row; i++) {
-    row_txfm(input, buf_ptr, cos_bit_row, stage_range);
-    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
-    input += txfm_size_col;
-    buf_ptr += txfm_size_col;
-  }
-
-  for (int c = 0; c < txfm_size_col; ++c) {
-    if (lr_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + c];
-    } else {
-      // flip left right
-      for (r = 0; r < txfm_size_row; ++r)
-        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
-    }
-    clamp_buf(temp_in, txfm_size_row, bd + 8);
-    col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
-    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
-
-    if (ud_flip == 0) {
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] =
-            highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
-      }
-    } else {
-      // flip upside down
-      for (r = 0; r < txfm_size_row; ++r) {
-        output[r * stride + c] = highbd_clip_pixel_add(
-            output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
-      }
-    }
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  int16x8_t a[64 * 8];
-  int16x8_t b[64 * 8];
-  int eobx, eoby, ud_flip, lr_flip;
-  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
-  const int32_t *input_1;
-  int temp_b = 0;
-
-  const transform_neon row_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-  const transform_neon col_txfm =
-      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
-  assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
-
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
-    input_1 = input;
-    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
-      int k = j * 8 + i * txfm_size_col;
-      load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride);
-      transpose_s16_8x8q(&a[k], &a[k]);
-      input_1 += 8;
-    }
-    input += (input_stride * 8);
-    if (abs(rect_type) == 1) {
-      int y = i * txfm_size_col;
-      round_shift_for_rect(&a[y], &a[y], input_stride);
-    }
-    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
-    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
-                                  -shift[0]);
-    if (lr_flip == 1) {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        int k = j * 8 + i * txfm_size_col;
-        flip_buf_ud_neon(&a[k], 8);
-        transpose_s16_8x8q(
-            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
-      }
-      temp_b += 8;
-    } else {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        int k = j * 8 + i * txfm_size_col;
-        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
-      }
-      temp_b += 8;
-    }
-  }
-  for (int j = 0; j < buf_size_w_div8; ++j) {
-    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
-    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
-                                  -shift[1]);
-  }
-
-  if (txfm_size_col >= 16) {
-    for (int i = 0; i < (txfm_size_col >> 4); i++) {
-      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
-                                      output + 16 * i, stride, ud_flip,
-                                      txfm_size_row);
-    }
-  } else if (txfm_size_col == 8) {
-    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_universe_neon(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  switch (tx_type) {
-    case IDTX:
-      lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
-                                     eob);
-      break;
-
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-      lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
-                                           tx_size, eob);
-      break;
-
-    case V_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-      lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
-                                           tx_size, eob);
-      break;
-
-    default:
-      lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
-                                            tx_size, eob);
-      break;
-  }
-}
-
-void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
-                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
-                                   int eob) {
-  switch (tx_size) {
-    case TX_4X4:
-      lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
-      break;
-
-    case TX_4X8:
-      lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
-      break;
-
-    case TX_8X4:
-      lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
-      break;
-
-    case TX_4X16:
-      lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
-      break;
-
-    case TX_16X4:
-      lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
-      break;
-
-    default:
-      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
-                                         tx_size, eob);
-      break;
-  }
-}
-void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
-                           const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  if (!txfm_param->lossless) {
-    av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
-                                  txfm_param->tx_size, txfm_param->eob);
-  } else {
-    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
-  }
-}
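The removed file ends with the 8-bit entry points `av1_lowbd_inv_txfm2d_add_neon` and `av1_inv_txfm_add_neon`, so 8-bit reconstruction no longer has its own NEON inverse-transform path; the add-back to the destination now happens on the 16-bit buffers. As a reference for what the deleted `lowbd_add_flip_buffer_*xn_neon` output stage computed, here is a minimal scalar sketch; the helper name and buffer layout are ours, not library API:

```c
#include <stdint.h>

// Scalar sketch of the removed output stage: add the inverse-transform
// residual onto the 8-bit destination, optionally flipping rows when ud_flip
// is set, and clamp to the 8-bit range. The surviving 16-bit path performs
// the same add/clamp on uint16_t pixels.
static void add_residual_clip8(const int16_t *res, uint8_t *dst, int stride,
                               int width, int height, int ud_flip) {
  for (int r = 0; r < height; ++r) {
    const int res_row = ud_flip ? (height - 1 - r) : r;
    for (int c = 0; c < width; ++c) {
      const int v = dst[r * stride + c] + res[res_row * width + c];
      dst[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}
```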
diff --git a/av1/common/arm/av1_inv_txfm_neon.h b/av1/common/arm/av1_inv_txfm_neon.h
deleted file mode 100644
index 644a8bc..0000000
--- a/av1/common/arm/av1_inv_txfm_neon.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
-#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "av1/common/enums.h"
-#include "av1/common/av1_inv_txfm1d.h"
-#include "av1/common/av1_inv_txfm1d_cfg.h"
-#include "av1/common/av1_txfm.h"
-
-typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
-                                  const int8_t cos_bit,
-                                  const int8_t *stage_ptr);
-typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
-                               int8_t cos_bit, int bit);
-
-DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
-  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                av1_eob_to_eobxy_16x16_default[16]) = {
-  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
-  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                av1_eob_to_eobxy_32x32_default[32]) = {
-  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
-  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
-  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
-  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
-  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
-  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
-  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                av1_eob_to_eobxy_16x32_default[32]) = {
-  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
-  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
-  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
-  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
-};
-
-DECLARE_ALIGNED(16, static const int16_t,
-                av1_eob_to_eobxy_32x16_default[16]) = {
-  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
-  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
-  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
-  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
-  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
-  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
-};
-
-DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
-  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
-};
-
-DECLARE_ALIGNED(16, static const int16_t *,
-                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
-  NULL,
-  av1_eob_to_eobxy_8x8_default,
-  av1_eob_to_eobxy_16x16_default,
-  av1_eob_to_eobxy_32x32_default,
-  av1_eob_to_eobxy_32x32_default,
-  NULL,
-  NULL,
-  av1_eob_to_eobxy_8x16_default,
-  av1_eob_to_eobxy_16x8_default,
-  av1_eob_to_eobxy_16x32_default,
-  av1_eob_to_eobxy_32x16_default,
-  av1_eob_to_eobxy_32x32_default,
-  av1_eob_to_eobxy_32x32_default,
-  NULL,
-  NULL,
-  av1_eob_to_eobxy_8x32_default,
-  av1_eob_to_eobxy_32x8_default,
-  av1_eob_to_eobxy_16x32_default,
-  av1_eob_to_eobxy_32x16_default,
-};
-
-static const int lowbd_txfm_all_1d_zeros_idx[32] = {
-  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-// Transform block width in log2 for eob (size of 64 map to 32)
-static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
-  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
-};
-
-static int eob_fill[32] = {
-  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
-  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-};
-
-static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
-                                              TX_SIZE tx_size, int eob) {
-  if (eob == 1) {
-    *eobx = 0;
-    *eoby = 0;
-    return;
-  }
-
-  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
-  const int eob_row = (eob - 1) >> tx_w_log2;
-  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
-  *eobx = eobxy & 0xFF;
-  *eoby = eobxy >> 8;
-}
-
-static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
-                                                 TX_SIZE tx_size, int eob) {
-  eob -= 1;
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
-  *eobx = eob / (eoby_max + 1);
-  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
-}
-
-static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
-                                                 TX_SIZE tx_size, int eob) {
-  eob -= 1;
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
-  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
-  const int temp_eoby = eob / (eobx_max + 1);
-  assert(temp_eoby < 32);
-  *eoby = eob_fill[temp_eoby];
-}
-
-#endif  // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
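The deleted header held lookup tables consumed only by the 8-bit path. Each 16-bit `av1_eob_to_eobxy_*` entry packs the last significant column in its low byte and the last significant row in its high byte, indexed by the row that contains the final nonzero coefficient. A standalone worked example of that unpacking (sketch only; the entry value is taken from the 8x8 table above):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int tx_w_log2 = 3;        // 8-wide transform block
  const int eob = 10;             // example end-of-block position
  const int eob_row = (eob - 1) >> tx_w_log2;  // row of last nonzero = 1
  const uint16_t entry = 0x0707;  // av1_eob_to_eobxy_8x8_default[eob_row]
  const int eobx = entry & 0xFF;  // last significant column
  const int eoby = entry >> 8;    // last significant row
  printf("eob=%d row=%d -> eobx=%d eoby=%d\n", eob, eob_row, eobx, eoby);
  return 0;
}
```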
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index 3d0dcb3..cc5f82f 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -35,103 +35,6 @@
   *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0];
 }
 
-// Store half of a vector.
-static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
-  *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
-}
-
-static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
-                                              int input_stride,
-                                              uint16_t *pred_buf_q3, int width,
-                                              int height) {
-  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
-  const int luma_stride = input_stride << 1;
-  do {
-    if (width == 4) {
-      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
-      const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
-      vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1));
-    } else if (width == 8) {
-      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
-      const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
-      vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1));
-    } else if (width == 16) {
-      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
-      const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
-      vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1));
-    } else {
-      const uint8x8x4_t top = vld4_u8(input);
-      const uint8x8x4_t bot = vld4_u8(input + input_stride);
-      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
-      const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]);
-      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
-      const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]);
-      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
-      const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]);
-      // equivalent to a vpaddlq_u8 (because vld4q interleaves)
-      const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]);
-      uint16x8x2_t sum;
-      sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
-      sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
-      vst2q_u16(pred_buf_q3, sum);
-    }
-    input += luma_stride;
-  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
-}
-
-static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
-                                              int input_stride,
-                                              uint16_t *pred_buf_q3, int width,
-                                              int height) {
-  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
-  do {
-    if (width == 4) {
-      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
-      vsth_u16(pred_buf_q3, vshl_n_u16(top, 2));
-    } else if (width == 8) {
-      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
-      vst1_u16(pred_buf_q3, vshl_n_u16(top, 2));
-    } else if (width == 16) {
-      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
-      vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2));
-    } else {
-      const uint8x8x4_t top = vld4_u8(input);
-      uint16x8x2_t sum;
-      // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves)
-      sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2);
-      sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2);
-      vst2q_u16(pred_buf_q3, sum);
-    }
-    input += input_stride;
-  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
-}
-
-static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
-                                              int input_stride,
-                                              uint16_t *pred_buf_q3, int width,
-                                              int height) {
-  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
-  do {
-    if (width == 4) {
-      const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3);
-      vst1_u16(pred_buf_q3, vget_low_u16(top));
-    } else if (width == 8) {
-      const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3);
-      vst1q_u16(pred_buf_q3, top);
-    } else {
-      const uint8x16_t top = vld1q_u8(input);
-      vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3));
-      vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3));
-      if (width == 32) {
-        const uint8x16_t next_top = vld1q_u8(input + 16);
-        vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3));
-        vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3));
-      }
-    }
-    input += input_stride;
-  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
-}
-
 #ifndef __aarch64__
 uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
@@ -469,49 +372,6 @@
   return result;
 }
 
-static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
-                                        uint8_t *dst, int dst_stride,
-                                        int alpha_q3, int width, int height) {
-  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
-  const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
-  if (width == 4) {
-    const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
-    const int16x4_t dc = vdup_n_s16(*dst);
-    do {
-      const int16x4_t pred =
-          predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-      vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred)));
-      dst += dst_stride;
-    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
-  } else {
-    const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
-    const int16x8_t dc = vdupq_n_s16(*dst);
-    do {
-      if (width == 8) {
-        vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign,
-                                            abs_alpha_q12, dc)));
-      } else if (width == 16) {
-        const int16x8x2_t pred =
-            predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-        const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
-                                       vqmovun_s16(pred.val[1]) } };
-        vst2_u8(dst, predun);
-      } else {
-        const int16x8x4_t pred =
-            predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
-        const uint8x8x4_t predun = {
-          { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
-            vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
-        };
-        vst4_u8(dst, predun);
-      }
-      dst += dst_stride;
-    } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
-  }
-}
-
-CFL_PREDICT_FN(neon, lbd)
-
 static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
   return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
 }
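The hunks above drop the 8-bit CfL kernels (the 4:2:0/4:2:2/4:4:4 luma subsampling and `cfl_predict_lbd_neon`), leaving only the high-bitdepth variants in this file. For the 4:2:0 case, the removed code sums each 2x2 luma block and shifts left by one, which is the block average expressed in Q3 precision; a scalar sketch of the same arithmetic (illustrative only, helper name is ours):

```c
#include <stdint.h>

// Scalar equivalent of the removed 8-bit CfL 4:2:0 subsampling: each output
// is (sum of a 2x2 luma block) << 1, i.e. the 2x2 average in Q3, matching the
// vpaddl/vpadal + vshl_n_u16(sum, 1) sequence deleted above.
static void cfl_subsample_420_q3(const uint8_t *luma, int stride,
                                 uint16_t *out_q3, int out_stride,
                                 int out_w, int out_h) {
  for (int y = 0; y < out_h; ++y) {
    for (int x = 0; x < out_w; ++x) {
      const uint8_t *p = luma + 2 * y * stride + 2 * x;
      const int sum = p[0] + p[1] + p[stride] + p[stride + 1];
      out_q3[y * out_stride + x] = (uint16_t)(sum << 1);
    }
  }
}
```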
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
deleted file mode 100644
index 76cd713..0000000
--- a/av1/common/arm/resize_neon.c
+++ /dev/null
@@ -1,834 +0,0 @@
-/*
- *
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "av1/common/resize.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
-#include "config/av1_rtcd.h"
-#include "config/aom_scale_rtcd.h"
-
-static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
-                                    const int16x8_t s2, const int16x8_t s3,
-                                    const int16x8_t s4, const int16x8_t s5,
-                                    const int16x8_t s6, const int16x8_t s7,
-                                    const int16x8_t filters,
-                                    const int16x8_t filter3,
-                                    const int16x8_t filter4) {
-  const int16x4_t filters_lo = vget_low_s16(filters);
-  const int16x4_t filters_hi = vget_high_s16(filters);
-  int16x8_t sum;
-
-  sum = vmulq_lane_s16(s0, filters_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
-  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
-  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
-  return vqrshrun_n_s16(sum, 7);
-}
-
-static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
-                                       const int16x8_t filters) {
-  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
-  int16x8_t ss[8];
-
-  ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
-  ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
-  ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
-  ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
-  ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
-  ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
-  ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
-  ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
-
-  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
-                     filters, filter3, filter4);
-}
-
-static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
-                                              const int src_stride,
-                                              uint8_t *dst,
-                                              const int dst_stride, const int w,
-                                              const int h) {
-  const int max_width = (w + 15) & ~15;
-  int y = h;
-
-  assert(w && h);
-
-  do {
-    int x = max_width;
-    do {
-      const uint8x16x2_t s = vld2q_u8(src);
-      vst1q_u8(dst, s.val[0]);
-      src += 32;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src += 2 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
-                                              const int src_stride,
-                                              uint8_t *dst,
-                                              const int dst_stride, const int w,
-                                              const int h) {
-  const int max_width = (w + 15) & ~15;
-  int y = h;
-
-  assert(w && h);
-
-  do {
-    int x = max_width;
-    do {
-      const uint8x16x4_t s = vld4q_u8(src);
-      vst1q_u8(dst, s.val[0]);
-      src += 64;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src += 4 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static INLINE void scale_plane_bilinear_kernel(
-    const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
-    const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
-    uint8_t *const dst) {
-  const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
-  const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
-  const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
-  const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
-  const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
-  const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
-  const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
-  const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);
-
-  const uint8x8_t hor0 = vrshrn_n_u16(h4, 7);  // temp: 00 01 02 03 04 05 06 07
-  const uint8x8_t hor1 = vrshrn_n_u16(h5, 7);  // temp: 08 09 0A 0B 0C 0D 0E 0F
-  const uint8x8_t hor2 = vrshrn_n_u16(h6, 7);  // temp: 10 11 12 13 14 15 16 17
-  const uint8x8_t hor3 = vrshrn_n_u16(h7, 7);  // temp: 18 19 1A 1B 1C 1D 1E 1F
-  const uint16x8_t v0 = vmull_u8(hor0, coef0);
-  const uint16x8_t v1 = vmull_u8(hor1, coef0);
-  const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
-  const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
-  // dst: 0 1 2 3 4 5 6 7  8 9 A B C D E F
-  const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
-  vst1q_u8(dst, d);
-}
-
-static INLINE void scale_plane_2_to_1_bilinear(
-    const uint8_t *const src, const int src_stride, uint8_t *dst,
-    const int dst_stride, const int w, const int h, const int16_t c0,
-    const int16_t c1) {
-  const int max_width = (w + 15) & ~15;
-  const uint8_t *src0 = src;
-  const uint8_t *src1 = src + src_stride;
-  const uint8x8_t coef0 = vdup_n_u8(c0);
-  const uint8x8_t coef1 = vdup_n_u8(c1);
-  int y = h;
-
-  assert(w && h);
-
-  do {
-    int x = max_width;
-    do {
-      // 000 002 004 006 008 00A 00C 00E  010 012 014 016 018 01A 01C 01E
-      // 001 003 005 007 009 00B 00D 00F  011 013 015 017 019 01B 01D 01F
-      const uint8x16x2_t s0 = vld2q_u8(src0);
-      // 100 102 104 106 108 10A 10C 10E  110 112 114 116 118 11A 11C 11E
-      // 101 103 105 107 109 10B 10D 10F  111 113 115 117 119 11B 11D 11F
-      const uint8x16x2_t s1 = vld2q_u8(src1);
-      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
-                                  coef0, coef1, dst);
-      src0 += 32;
-      src1 += 32;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src0 += 2 * (src_stride - max_width);
-    src1 += 2 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static INLINE void scale_plane_4_to_1_bilinear(
-    const uint8_t *const src, const int src_stride, uint8_t *dst,
-    const int dst_stride, const int w, const int h, const int16_t c0,
-    const int16_t c1) {
-  const int max_width = (w + 15) & ~15;
-  const uint8_t *src0 = src;
-  const uint8_t *src1 = src + src_stride;
-  const uint8x8_t coef0 = vdup_n_u8(c0);
-  const uint8x8_t coef1 = vdup_n_u8(c1);
-  int y = h;
-
-  assert(w && h);
-
-  do {
-    int x = max_width;
-    do {
-      // (*) -- useless
-      // 000 004 008 00C 010 014 018 01C  020 024 028 02C 030 034 038 03C
-      // 001 005 009 00D 011 015 019 01D  021 025 029 02D 031 035 039 03D
-      // 002 006 00A 00E 012 016 01A 01E  022 026 02A 02E 032 036 03A 03E (*)
-      // 003 007 00B 00F 013 017 01B 01F  023 027 02B 02F 033 037 03B 03F (*)
-      const uint8x16x4_t s0 = vld4q_u8(src0);
-      // 100 104 108 10C 110 114 118 11C  120 124 128 12C 130 134 138 13C
-      // 101 105 109 10D 111 115 119 11D  121 125 129 12D 131 135 139 13D
-      // 102 106 10A 10E 112 116 11A 11E  122 126 12A 12E 132 136 13A 13E (*)
-      // 103 107 10B 10F 113 117 11B 11F  123 127 12B 12F 133 137 13B 13F (*)
-      const uint8x16x4_t s1 = vld4q_u8(src1);
-      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
-                                  coef0, coef1, dst);
-      src0 += 64;
-      src1 += 64;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src0 += 4 * (src_stride - max_width);
-    src1 += 4 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
-                                       uint8_t *dst, const int dst_stride,
-                                       const int w, const int h,
-                                       const int16_t *const coef,
-                                       uint8_t *const temp_buffer) {
-  const int width_hor = (w + 3) & ~3;
-  const int width_ver = (w + 7) & ~7;
-  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
-  const int height_ver = (h + 3) & ~3;
-  const int16x8_t filters = vld1q_s16(coef);
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  uint8x8_t s[14], d[4];
-
-  assert(w && h);
-
-  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
-
-  // horizontal 4x8
-  // Note: processing 4x8 is about 20% faster than processing row by row using
-  // vld4_u8().
-  do {
-    load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                &s[6], &s[7]);
-    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
-    x = width_hor;
-
-    do {
-      src += 8;
-      load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
-                  &s[12], &s[13]);
-      transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
-                       &s[13]);
-
-      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
-      d[1] = scale_filter_8(&s[2], filters);  // 01 11 21 31 41 51 61 71
-      d[2] = scale_filter_8(&s[4], filters);  // 02 12 22 32 42 52 62 72
-      d[3] = scale_filter_8(&s[6], filters);  // 03 13 23 33 43 53 63 73
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
-      vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
-                    0);
-      vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
-                    0);
-      vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
-                    0);
-      vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
-                    0);
-      vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
-                    1);
-      vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
-                    1);
-      vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
-                    1);
-      vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
-                    1);
-
-      s[0] = s[8];
-      s[1] = s[9];
-      s[2] = s[10];
-      s[3] = s[11];
-      s[4] = s[12];
-      s[5] = s[13];
-
-      t += 4;
-      x -= 4;
-    } while (x);
-    src += 8 * src_stride - 2 * width_hor;
-    t += 7 * width_hor;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x4
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                &s[7]);
-    t += 6 * width_hor;
-    y = height_ver;
-
-    do {
-      load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
-                  &s[12], &s[13]);
-      t += 8 * width_hor;
-
-      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
-      d[1] = scale_filter_8(&s[2], filters);  // 10 11 12 13 14 15 16 17
-      d[2] = scale_filter_8(&s[4], filters);  // 20 21 22 23 24 25 26 27
-      d[3] = scale_filter_8(&s[6], filters);  // 30 31 32 33 34 35 36 37
-      vst1_u8(dst + 0 * dst_stride, d[0]);
-      vst1_u8(dst + 1 * dst_stride, d[1]);
-      vst1_u8(dst + 2 * dst_stride, d[2]);
-      vst1_u8(dst + 3 * dst_stride, d[3]);
-
-      s[0] = s[8];
-      s[1] = s[9];
-      s[2] = s[10];
-      s[3] = s[11];
-      s[4] = s[12];
-      s[5] = s[13];
-
-      dst += 4 * dst_stride;
-      y -= 4;
-    } while (y);
-    t -= width_hor * (2 * height_ver + 6);
-    t += 8;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
-                                       uint8_t *dst, const int dst_stride,
-                                       const int w, const int h,
-                                       const int16_t *const coef,
-                                       uint8_t *const temp_buffer) {
-  const int width_hor = (w + 1) & ~1;
-  const int width_ver = (w + 7) & ~7;
-  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
-  const int height_ver = (h + 1) & ~1;
-  const int16x8_t filters = vld1q_s16(coef);
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  uint8x8_t s[12], d[2];
-
-  assert(w && h);
-
-  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
-
-  // horizontal 2x8
-  // Note: processing 2x8 is about 20% faster than processing row by row using
-  // vld4_u8().
-  do {
-    load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                &s[6], &s[7]);
-    transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
-    x = width_hor;
-
-    do {
-      uint8x8x2_t dd;
-      src += 8;
-      load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
-                  &s[10], &s[11]);
-      transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
-                       &s[11]);
-
-      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
-      d[1] = scale_filter_8(&s[4], filters);  // 01 11 21 31 41 51 61 71
-      // dd.val[0]: 00 01 20 21 40 41 60 61
-      // dd.val[1]: 10 11 30 31 50 51 70 71
-      dd = vtrn_u8(d[0], d[1]);
-      vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
-                    vreinterpret_u16_u8(dd.val[0]), 0);
-      vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
-                    vreinterpret_u16_u8(dd.val[1]), 0);
-      vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
-                    vreinterpret_u16_u8(dd.val[0]), 1);
-      vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
-                    vreinterpret_u16_u8(dd.val[1]), 1);
-      vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
-                    vreinterpret_u16_u8(dd.val[0]), 2);
-      vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
-                    vreinterpret_u16_u8(dd.val[1]), 2);
-      vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
-                    vreinterpret_u16_u8(dd.val[0]), 3);
-      vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
-                    vreinterpret_u16_u8(dd.val[1]), 3);
-
-      s[0] = s[8];
-      s[1] = s[9];
-      s[2] = s[10];
-      s[3] = s[11];
-
-      t += 2;
-      x -= 2;
-    } while (x);
-    src += 8 * src_stride - 4 * width_hor;
-    t += 7 * width_hor;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x2
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
-    t += 4 * width_hor;
-    y = height_ver;
-
-    do {
-      load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
-                  &s[10], &s[11]);
-      t += 8 * width_hor;
-
-      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
-      d[1] = scale_filter_8(&s[4], filters);  // 10 11 12 13 14 15 16 17
-      vst1_u8(dst + 0 * dst_stride, d[0]);
-      vst1_u8(dst + 1 * dst_stride, d[1]);
-
-      s[0] = s[8];
-      s[1] = s[9];
-      s[2] = s[10];
-      s[3] = s[11];
-
-      dst += 2 * dst_stride;
-      y -= 2;
-    } while (y);
-    t -= width_hor * (4 * height_ver + 4);
-    t += 8;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
-                                              const uint8x8_t *const coef) {
-  const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
-  const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);
-
-  return vrshrn_n_u16(h1, 7);
-}
-
-// Notes for 4 to 3 scaling:
-//
-// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
-// multiple of 6, and no less than w.
-//
-// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
-// multiple of 8, and no less than w.
-//
-// 3. 8 columns are calculated in each horizontal inner loop for further
-// vertical scaling, so height_hor must be multiple of 8, and no less than
-// 4 * h / 3.
-//
-// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
-// be multiple of 6, and no less than h.
-//
-// 5. The physical location of the last row of the 4 to 3 scaled frame is
-// decided by phase_scaler, and are always less than 1 pixel below the last row
-// of the original image.
-static void scale_plane_4_to_3_bilinear(const uint8_t *src,
-                                        const int src_stride, uint8_t *dst,
-                                        const int dst_stride, const int w,
-                                        const int h, const int phase_scaler,
-                                        uint8_t *const temp_buffer) {
-  static const int step_q4 = 16 * 4 / 3;
-  const int width_hor = (w + 5) - ((w + 5) % 6);
-  const int stride_hor = width_hor + 2;  // store 2 extra pixels
-  const int width_ver = (w + 7) & ~7;
-  // We only need 1 extra row below because there are only 2 bilinear
-  // coefficients.
-  const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
-  const int height_ver = (h + 5) - ((h + 5) % 6);
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  uint8x8_t s[9], d[8], c[6];
-  const InterpKernel *interp_kernel =
-      (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr;
-  assert(w && h);
-
-  c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]);
-  c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]);
-  c[2] = vdup_n_u8(
-      (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]);
-  c[3] = vdup_n_u8(
-      (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]);
-  c[4] = vdup_n_u8(
-      (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]);
-  c[5] = vdup_n_u8(
-      (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]);
-
-  d[6] = vdup_n_u8(0);
-  d[7] = vdup_n_u8(0);
-
-  // horizontal 6x8
-  do {
-    load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                &s[6], &s[7]);
-    src += 1;
-    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
-    x = width_hor;
-
-    do {
-      load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                  &s[7], &s[8]);
-      src += 8;
-      transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
-
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      d[0] = scale_filter_bilinear(&s[0], &c[0]);
-      d[1] =
-          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
-      d[2] =
-          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
-      d[3] = scale_filter_bilinear(&s[4], &c[0]);
-      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
-                                   &c[2]);
-      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
-                                   &c[4]);
-
-      // 00 01 02 03 04 05 xx xx
-      // 10 11 12 13 14 15 xx xx
-      // 20 21 22 23 24 25 xx xx
-      // 30 31 32 33 34 35 xx xx
-      // 40 41 42 43 44 45 xx xx
-      // 50 51 52 53 54 55 xx xx
-      // 60 61 62 63 64 65 xx xx
-      // 70 71 72 73 74 75 xx xx
-      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
-      // store 2 extra pixels
-      vst1_u8(t + 0 * stride_hor, d[0]);
-      vst1_u8(t + 1 * stride_hor, d[1]);
-      vst1_u8(t + 2 * stride_hor, d[2]);
-      vst1_u8(t + 3 * stride_hor, d[3]);
-      vst1_u8(t + 4 * stride_hor, d[4]);
-      vst1_u8(t + 5 * stride_hor, d[5]);
-      vst1_u8(t + 6 * stride_hor, d[6]);
-      vst1_u8(t + 7 * stride_hor, d[7]);
-
-      s[0] = s[8];
-
-      t += 6;
-      x -= 6;
-    } while (x);
-    src += 8 * src_stride - 4 * width_hor / 3 - 1;
-    t += 7 * stride_hor + 2;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x6
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                &s[7]);
-    t += stride_hor;
-    y = height_ver;
-
-    do {
-      load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                  &s[7], &s[8]);
-      t += 8 * stride_hor;
-
-      d[0] = scale_filter_bilinear(&s[0], &c[0]);
-      d[1] =
-          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
-      d[2] =
-          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
-      d[3] = scale_filter_bilinear(&s[4], &c[0]);
-      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
-                                   &c[2]);
-      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
-                                   &c[4]);
-      vst1_u8(dst + 0 * dst_stride, d[0]);
-      vst1_u8(dst + 1 * dst_stride, d[1]);
-      vst1_u8(dst + 2 * dst_stride, d[2]);
-      vst1_u8(dst + 3 * dst_stride, d[3]);
-      vst1_u8(dst + 4 * dst_stride, d[4]);
-      vst1_u8(dst + 5 * dst_stride, d[5]);
-
-      s[0] = s[8];
-
-      dst += 6 * dst_stride;
-      y -= 6;
-    } while (y);
-    t -= stride_hor * (4 * height_ver / 3 + 1);
-    t += 8;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
-                                       uint8_t *dst, const int dst_stride,
-                                       const int w, const int h,
-                                       const int16_t *const coef,
-                                       const int phase_scaler,
-                                       uint8_t *const temp_buffer) {
-  static const int step_q4 = 16 * 4 / 3;
-  const int width_hor = (w + 5) - ((w + 5) % 6);
-  const int stride_hor = width_hor + 2;  // store 2 extra pixels
-  const int width_ver = (w + 7) & ~7;
-  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
-  // above and (SUBPEL_TAPS / 2) extra rows below.
-  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
-  const int height_ver = (h + 5) - ((h + 5) % 6);
-  const int16x8_t filters0 =
-      vld1q_s16(&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
-  const int16x8_t filters1 =
-      vld1q_s16(&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
-  const int16x8_t filters2 =
-      vld1q_s16(&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  uint8x8_t s[15], d[8];
-
-  assert(w && h);
-
-  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
-  d[6] = vdup_n_u8(0);
-  d[7] = vdup_n_u8(0);
-
-  // horizontal 6x8
-  do {
-    load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
-                &s[6], &s[7]);
-    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
-    x = width_hor;
-
-    do {
-      src += 8;
-      load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
-                  &s[13], &s[14]);
-      transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
-                       &s[14]);
-
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      d[0] = scale_filter_8(&s[0], filters0);
-      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
-      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
-      d[3] = scale_filter_8(&s[4], filters0);
-      d[4] =
-          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
-      d[5] =
-          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
-
-      // 00 01 02 03 04 05 xx xx
-      // 10 11 12 13 14 15 xx xx
-      // 20 21 22 23 24 25 xx xx
-      // 30 31 32 33 34 35 xx xx
-      // 40 41 42 43 44 45 xx xx
-      // 50 51 52 53 54 55 xx xx
-      // 60 61 62 63 64 65 xx xx
-      // 70 71 72 73 74 75 xx xx
-      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
-      // store 2 extra pixels
-      vst1_u8(t + 0 * stride_hor, d[0]);
-      vst1_u8(t + 1 * stride_hor, d[1]);
-      vst1_u8(t + 2 * stride_hor, d[2]);
-      vst1_u8(t + 3 * stride_hor, d[3]);
-      vst1_u8(t + 4 * stride_hor, d[4]);
-      vst1_u8(t + 5 * stride_hor, d[5]);
-      vst1_u8(t + 6 * stride_hor, d[6]);
-      vst1_u8(t + 7 * stride_hor, d[7]);
-
-      s[0] = s[8];
-      s[1] = s[9];
-      s[2] = s[10];
-      s[3] = s[11];
-      s[4] = s[12];
-      s[5] = s[13];
-      s[6] = s[14];
-
-      t += 6;
-      x -= 6;
-    } while (x);
-    src += 8 * src_stride - 4 * width_hor / 3;
-    t += 7 * stride_hor + 2;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x6
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
-                &s[7]);
-    t += 7 * stride_hor;
-    y = height_ver;
-
-    do {
-      load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
-                  &s[13], &s[14]);
-      t += 8 * stride_hor;
-
-      d[0] = scale_filter_8(&s[0], filters0);
-      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
-      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
-      d[3] = scale_filter_8(&s[4], filters0);
-      d[4] =
-          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
-      d[5] =
-          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
-      vst1_u8(dst + 0 * dst_stride, d[0]);
-      vst1_u8(dst + 1 * dst_stride, d[1]);
-      vst1_u8(dst + 2 * dst_stride, d[2]);
-      vst1_u8(dst + 3 * dst_stride, d[3]);
-      vst1_u8(dst + 4 * dst_stride, d[4]);
-      vst1_u8(dst + 5 * dst_stride, d[5]);
-
-      s[0] = s[8];
-      s[1] = s[9];
-      s[2] = s[10];
-      s[3] = s[11];
-      s[4] = s[12];
-      s[5] = s[13];
-      s[6] = s[14];
-
-      dst += 6 * dst_stride;
-      y -= 6;
-    } while (y);
-    t -= stride_hor * (4 * height_ver / 3 + 7);
-    t += 8;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
-                                      YV12_BUFFER_CONFIG *dst,
-                                      const InterpFilter filter,
-                                      const int phase, const int num_planes) {
-  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
-  // the static analysis warnings.
-  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
-    const int is_uv = i > 0;
-    const int src_w = src->crop_widths[is_uv];
-    const int src_h = src->crop_heights[is_uv];
-    const int dst_w = dst->crop_widths[is_uv];
-    const int dst_h = dst->crop_heights[is_uv];
-    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
-    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
-
-    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
-      if (phase == 0) {
-        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], dst_w,
-                                   dst_h);
-      } else if (filter == BILINEAR) {
-        const int16_t c0 = av1_bilinear_filters[phase][3];
-        const int16_t c1 = av1_bilinear_filters[phase][4];
-        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
-                                    dst->buffers[i], dst->strides[is_uv], dst_w,
-                                    dst_h, c0, c1);
-      } else {
-        const int buffer_stride = (dst_y_w + 3) & ~3;
-        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
-        uint8_t *const temp_buffer =
-            (uint8_t *)malloc(buffer_stride * buffer_height);
-        if (temp_buffer) {
-          const InterpKernel *interp_kernel =
-              (const InterpKernel *)av1_interp_filter_params_list[filter]
-                  .filter_ptr;
-          scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
-                                     dst->buffers[i], dst->strides[is_uv],
-                                     dst_w, dst_h, interp_kernel[phase],
-                                     temp_buffer);
-          free(temp_buffer);
-        }
-      }
-    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
-      if (phase == 0) {
-        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], dst_w,
-                                   dst_h);
-      } else if (filter == BILINEAR) {
-        const int16_t c0 = av1_bilinear_filters[phase][3];
-        const int16_t c1 = av1_bilinear_filters[phase][4];
-        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
-                                    dst->buffers[i], dst->strides[is_uv], dst_w,
-                                    dst_h, c0, c1);
-      } else {
-        const int buffer_stride = (dst_y_w + 1) & ~1;
-        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
-        uint8_t *const temp_buffer =
-            (uint8_t *)malloc(buffer_stride * buffer_height);
-        if (temp_buffer) {
-          const InterpKernel *interp_kernel =
-              (const InterpKernel *)av1_interp_filter_params_list[filter]
-                  .filter_ptr;
-          scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
-                                     dst->buffers[i], dst->strides[is_uv],
-                                     dst_w, dst_h, interp_kernel[phase],
-                                     temp_buffer);
-          free(temp_buffer);
-        }
-      }
-    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
-      // 4 to 3
-      const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
-      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
-      uint8_t *const temp_buffer =
-          (uint8_t *)malloc(buffer_stride * buffer_height);
-      if (temp_buffer) {
-        if (filter == BILINEAR) {
-          scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv],
-                                      dst->buffers[i], dst->strides[is_uv],
-                                      dst_w, dst_h, phase, temp_buffer);
-        } else {
-          const InterpKernel *interp_kernel =
-              (const InterpKernel *)av1_interp_filter_params_list[filter]
-                  .filter_ptr;
-          scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
-                                     dst->buffers[i], dst->strides[is_uv],
-                                     dst_w, dst_h, interp_kernel[phase], phase,
-                                     temp_buffer);
-        }
-      }
-    } else {
-      av1_resize_plane(src->buffers[i], src_h, src_w, src->strides[is_uv],
-                       dst->buffers[i], dst_h, dst_w, dst->strides[is_uv]);
-    }
-    aom_extend_frame_borders(dst, num_planes);
-  }
-}
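This deletes the whole 8-bit NEON resize implementation (the 2:1, 4:1 and 4:3 plane scalers and `av1_resize_and_extend_frame_neon`); resizing is left to the 16-bit code. The simplest removed kernel, `scale_plane_2_to_1_phase_0`, keeps every second sample in each dimension, as in this scalar sketch (illustrative, helper name is ours):

```c
#include <stdint.h>

// Scalar equivalent of the removed phase-0 2:1 downscale: no filtering, just
// decimation by two in both directions (the NEON version did this with
// vld2q_u8 and stored only val[0]).
static void scale_2_to_1_phase0(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                int dst_w, int dst_h) {
  for (int y = 0; y < dst_h; ++y) {
    for (int x = 0; x < dst_w; ++x) {
      dst[y * dst_stride + x] = src[2 * y * src_stride + 2 * x];
    }
  }
}
```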
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c
index eaa3d7f..d81c4af 100644
--- a/av1/common/arm/selfguided_neon.c
+++ b/av1/common/arm/selfguided_neon.c
@@ -1386,7 +1386,7 @@
 int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
                                     int stride, int32_t *flt0, int32_t *flt1,
                                     int flt_stride, int sgr_params_idx,
-                                    int bit_depth, int highbd) {
+                                    int bit_depth) {
   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   assert(!(params->r[0] == 0 && params->r[1] == 0));
 
@@ -1398,20 +1398,12 @@
   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   const int dgd_stride = stride;
 
-  if (highbd) {
-    const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
-    src_convert_hbd_copy(
-        dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
-        dgd_stride,
-        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
-        dgd16_stride, width_ext, height_ext);
-  } else {
-    src_convert_u8_to_u16(
-        dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
-        dgd_stride,
-        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
-        dgd16_stride, width_ext, height_ext);
-  }
+  const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+  src_convert_hbd_copy(
+      dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+      dgd_stride,
+      dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+      dgd16_stride, width_ext, height_ext);
 
   if (params->r[0] > 0)
     restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
@@ -1426,7 +1418,7 @@
                                            int height, int stride, int eps,
                                            const int *xqd, uint8_t *dst8,
                                            int dst_stride, int32_t *tmpbuf,
-                                           int bit_depth, int highbd) {
+                                           int bit_depth) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -1442,20 +1434,12 @@
 
   assert(!(params->r[0] == 0 && params->r[1] == 0));
 
-  if (highbd) {
-    const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
-    src_convert_hbd_copy(
-        dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
-        dgd_stride,
-        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
-        dgd16_stride, width_ext, height_ext);
-  } else {
-    src_convert_u8_to_u16(
-        dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
-        dgd_stride,
-        dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
-        dgd16_stride, width_ext, height_ext);
-  }
+  const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+  src_convert_hbd_copy(
+      dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+      dgd_stride,
+      dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+      dgd16_stride, width_ext, height_ext);
 
   if (params->r[0] > 0)
     restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
@@ -1474,7 +1458,6 @@
     int16x8_t r0, s0;
     uint16x8_t r4;
     int32x4_t u0, u4, v0, v4, f00, f10;
-    uint8x8_t t0;
     int count = 0, w = width, h = height, rc = 0;
 
     const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
@@ -1527,13 +1510,8 @@
 
         r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
 
-        if (highbd) {
-          r4 = vminq_u16(r4, max);
-          vst1q_u16(dst16_ptr, r4);
-        } else {
-          t0 = vqmovn_u16(r4);
-          vst1_u8(dst_ptr, t0);
-        }
+        r4 = vminq_u16(r4, max);
+        vst1q_u16(dst16_ptr, r4);
 
         w -= 8;
         count += 8;
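With the `highbd` argument gone from the self-guided restoration functions, `dat8` and `dst8` are always pointers to 16-bit samples wrapped for the 8-bit-typed interface, which is why the `src_convert_u8_to_u16` branch and the `vqmovn_u16` narrowing store could be dropped. A self-contained sketch of that pointer convention follows; the macro definitions here are local stand-ins written to mirror the library's CONVERT_TO_BYTEPTR/CONVERT_TO_SHORTPTR convention, not copied from its headers:

```c
#include <stdint.h>

// Local stand-in macros illustrating the wrapped-pointer convention assumed
// above: a uint16_t buffer is tagged into a uint8_t * by halving the address
// and recovered by doubling it, so the round trip is lossless for 2-byte
// aligned buffers.
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

static uint16_t plane[64 * 64];  // 16-bit working plane

int main(void) {
  const uint8_t *dat8 = CONVERT_TO_BYTEPTR(plane);  // passed through u8 APIs
  uint16_t *back = CONVERT_TO_SHORTPTR(dat8);       // recovered internally
  return back == plane ? 0 : 1;                     // 0: round trip succeeded
}
```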
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
deleted file mode 100644
index 74ee90f..0000000
--- a/av1/common/arm/warp_plane_neon.c
+++ /dev/null
@@ -1,716 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <assert.h>
-#include <arm_neon.h>
-#include <memory.h>
-#include <math.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-#include "config/av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-#include "av1/common/scale.h"
-
-/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
-   * Each coefficient is stored in 8 bits instead of 16 bits
-   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
-
-     This is done in order to avoid overflow: Since the tap with the largest
-     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
-     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
-     convolve functions.
-
-     Instead, we use the summation order
-     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
-     The rearrangement of coefficients in this table is so that we can get the
-     coefficients into the correct order more quickly.
-*/
-/* clang-format off */
-DECLARE_ALIGNED(8, static const int8_t,
-                filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
-#if WARPEDPIXEL_PREC_BITS == 6
-  // [-1, 0)
-  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
-  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
-  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
-  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
-  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
-  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
-  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
-  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
-  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
-  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
-  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
-  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
-  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
-  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
-  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
-  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
-  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
-  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
-  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
-  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
-  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
-  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
-  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
-  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
-  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
-  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
-  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
-  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
-  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
-  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
-  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
-  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
-  // [0, 1)
-  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
-  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
-  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
-  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
-  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
-  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
-  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
-  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
-  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
-  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
-  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
-  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
-  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
-  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
-  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
-  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
-  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
-  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
-  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
-  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
-  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
-  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
-  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
-  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
-  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
-  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
-  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
-  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
-  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
-  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
-  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
-  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
-  // [1, 2)
-  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
-  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
-  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
-  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
-  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
-  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
-  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
-  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
-  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
-  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
-  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
-  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
-  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
-  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
-  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
-  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
-  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
-  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
-  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
-  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
-  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
-  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
-  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
-  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
-  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
-  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
-  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
-  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
-  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
-  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
-  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
-  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
-  // dummy (replicate row index 191)
-  { 0, 0,   2,  -1, 0,   0, 127, 0},
-
-#else
-  // [-1, 0)
-  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
-  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
-  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
-  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
-  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
-  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
-  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
-  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
-  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
-  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
-  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
-  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
-  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
-  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
-  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
-  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
-  // [0, 1)
-  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
-  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
-  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
-  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
-  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
-  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
-  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
-  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
-  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
-  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
-  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
-  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
-  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
-  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
-  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
-  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
-  // [1, 2)
-  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
-  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
-  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
-  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
-  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
-  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
-  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
-  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
-  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
-  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
-  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
-  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
-  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
-  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
-  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
-  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
-  // dummy (replicate row index 95)
-  { 0, 0,   4,  -3, 0,  -1, 127, 1},
-#endif  // WARPEDPIXEL_PREC_BITS == 6
-};
-/* clang-format on */
-
-static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
-                            uint8x8_t src_1, int16x4_t *res) {
-  int16x8_t coeff_0, coeff_1;
-  int16x8_t pix_0, pix_1;
-
-  coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
-                         vreinterpret_s16_s32(x1.val[0]));
-  coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
-                         vreinterpret_s16_s32(x1.val[1]));
-
-  pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
-  pix_0 = vmulq_s16(coeff_0, pix_0);
-
-  pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
-  pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
-
-  *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
-}
-
-static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
-                                          uint8x16_t src_3, uint8x16_t src_4,
-                                          int16x8_t *tmp_dst, int sx, int alpha,
-                                          int k, const int offset_bits_horiz,
-                                          const int reduce_bits_horiz) {
-  const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
-                            255, 0, 255, 0, 255, 0, 255, 0 };
-  const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
-  const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
-
-  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
-  int32x2x2_t b0, b1;
-  uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
-  int32x4_t tmp_res_low, tmp_res_high;
-  uint16x8_t res;
-  int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
-
-  uint8x16_t tmp_0 = vandq_u8(src_1, mask);
-  uint8x16_t tmp_1 = vandq_u8(src_2, mask);
-  uint8x16_t tmp_2 = vandq_u8(src_3, mask);
-  uint8x16_t tmp_3 = vandq_u8(src_4, mask);
-
-  tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
-  tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
-
-  src_1 = vaddq_u8(tmp_0, tmp_2);
-  src_2 = vaddq_u8(tmp_1, tmp_3);
-
-  src_1_low = vget_low_u8(src_1);
-  src_2_low = vget_low_u8(src_2);
-  src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
-  src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
-  src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
-  src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
-
-  // Loading the 8 filter taps
-  f0 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f1 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f2 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f3 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f4 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f5 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f6 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
-  f7 = vmovl_s8(
-      vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
-
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
-                vreinterpret_s32_s16(vget_low_s16(f2)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
-                vreinterpret_s32_s16(vget_low_s16(f6)));
-  convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
-
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
-                vreinterpret_s32_s16(vget_low_s16(f3)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
-                vreinterpret_s32_s16(vget_low_s16(f7)));
-  convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
-
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
-                vreinterpret_s32_s16(vget_high_s16(f2)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
-                vreinterpret_s32_s16(vget_high_s16(f6)));
-  convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
-
-  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
-                vreinterpret_s32_s16(vget_high_s16(f3)));
-  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
-                vreinterpret_s32_s16(vget_high_s16(f7)));
-  convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
-
-  tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
-  tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
-
-  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
-  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
-
-  res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
-  res = vqrshlq_u16(res, shift);
-
-  tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
-}
-
-static INLINE void vertical_filter_neon(const int16x8_t *src,
-                                        int32x4_t *res_low, int32x4_t *res_high,
-                                        int sy, int gamma) {
-  int16x4_t src_0, src_1, fltr_0, fltr_1;
-  int32x4_t res_0, res_1;
-  int32x2_t res_0_im, res_1_im;
-  int32x4_t res_even, res_odd, im_res_0, im_res_1;
-
-  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
-  int16x8x2_t b0, b1, b2, b3;
-  int32x4x2_t c0, c1, c2, c3;
-  int32x4x2_t d0, d1, d2, d3;
-
-  b0 = vtrnq_s16(src[0], src[1]);
-  b1 = vtrnq_s16(src[2], src[3]);
-  b2 = vtrnq_s16(src[4], src[5]);
-  b3 = vtrnq_s16(src[6], src[7]);
-
-  c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
-                 vreinterpretq_s32_s16(b0.val[1]));
-  c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
-                 vreinterpretq_s32_s16(b1.val[1]));
-  c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
-                 vreinterpretq_s32_s16(b2.val[1]));
-  c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
-                 vreinterpretq_s32_s16(b3.val[1]));
-
-  f0 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f1 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f2 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f3 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f4 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f5 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f6 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  f7 = vld1q_s16((int16_t *)(av1_warped_filter +
-                             ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
-  d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
-  d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
-  d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
-
-  // row:0,1 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
-
-  // row:0,1,2,3 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
-  // row:0,1 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
-
-  // row:0,1,2,3 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
-  // row:0,1,2,3 even_col:0,2,4,6
-  im_res_0 = vcombine_s32(res_0_im, res_1_im);
-
-  // row:4,5 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
-
-  // row:4,5,6,7 even_col:0,2
-  src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
-  // row:4,5 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
-
-  // row:4,5,6,7 even_col:4,6
-  src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
-  // row:4,5,6,7 even_col:0,2,4,6
-  im_res_1 = vcombine_s32(res_0_im, res_1_im);
-
-  // row:0-7 even_col:0,2,4,6
-  res_even = vaddq_s32(im_res_0, im_res_1);
-
-  // row:0,1 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
-
-  // row:0,1,2,3 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
-  fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
-  // row:0,1 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
-
-  // row:0,1,2,3 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
-  fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
-  // row:0,1,2,3 odd_col:1,3,5,7
-  im_res_0 = vcombine_s32(res_0_im, res_1_im);
-
-  // row:4,5 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
-  res_0 = vmull_s16(src_0, fltr_0);
-
-  // row:4,5,6,7 odd_col:1,3
-  src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
-  fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
-  res_0 = vmlal_s16(res_0, src_0, fltr_0);
-  res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
-
-  // row:4,5 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
-  res_1 = vmull_s16(src_1, fltr_1);
-
-  // row:4,5,6,7 odd_col:5,7
-  src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
-  fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
-  res_1 = vmlal_s16(res_1, src_1, fltr_1);
-  res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
-
-  // row:4,5,6,7 odd_col:1,3,5,7
-  im_res_1 = vcombine_s32(res_0_im, res_1_im);
-
-  // row:0-7 odd_col:1,3,5,7
-  res_odd = vaddq_s32(im_res_0, im_res_1);
-
-  // reordering as 0 1 2 3 | 4 5 6 7
-  c0 = vtrnq_s32(res_even, res_odd);
-
-  // Final store
-  *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
-  *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
-}
-
-void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
-                          int height, int stride, uint8_t *pred, int p_col,
-                          int p_row, int p_width, int p_height, int p_stride,
-                          int subsampling_x, int subsampling_y,
-                          ConvolveParams *conv_params, int16_t alpha,
-                          int16_t beta, int16_t gamma, int16_t delta) {
-  int16x8_t tmp[15];
-  const int bd = 8;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
-  const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
-  const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
-
-  int limit = 0;
-  uint8x16_t vec_dup, mask_val;
-  int32x4_t res_lo, res_hi;
-  int16x8_t result_final;
-  uint8x16_t src_1, src_2, src_3, src_4;
-  uint8x16_t indx_vec = {
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-  };
-  uint8x16_t cmp_vec;
-
-  const int reduce_bits_horiz = conv_params->round_0;
-  const int reduce_bits_vert = conv_params->is_compound
-                                   ? conv_params->round_1
-                                   : 2 * FILTER_BITS - reduce_bits_horiz;
-  const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-
-  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int16x4_t res_sub_const =
-      vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
-                   (1 << (offset_bits - conv_params->round_1 - 1))));
-  int k;
-
-  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
-
-  for (int i = 0; i < p_height; i += 8) {
-    for (int j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      // horizontal
-      if (ix4 <= -7) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int16_t dup_val =
-              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
-
-          tmp[k + 7] = vdupq_n_s16(dup_val);
-        }
-      } else if (ix4 >= width + 6) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-                            ref[iy * stride + (width - 1)] *
-                                (1 << (FILTER_BITS - reduce_bits_horiz));
-          tmp[k + 7] = vdupq_n_s16(dup_val);
-        }
-      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
-        const int out_of_boundary_left = -(ix4 - 6);
-        const int out_of_boundary_right = (ix4 + 8) - width;
-
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          const uint8_t *src = ref + iy * stride + ix4 - 7;
-          src_1 = vld1q_u8(src);
-
-          if (out_of_boundary_left >= 0) {
-            limit = out_of_boundary_left + 1;
-            cmp_vec = vdupq_n_u8(out_of_boundary_left);
-            vec_dup = vdupq_n_u8(*(src + limit));
-            mask_val = vcleq_u8(indx_vec, cmp_vec);
-            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
-          }
-          if (out_of_boundary_right >= 0) {
-            limit = 15 - (out_of_boundary_right + 1);
-            cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
-            vec_dup = vdupq_n_u8(*(src + limit));
-            mask_val = vcgeq_u8(indx_vec, cmp_vec);
-            src_1 = vbslq_u8(mask_val, vec_dup, src_1);
-          }
-          src_2 = vextq_u8(src_1, src_1, 1);
-          src_3 = vextq_u8(src_2, src_2, 1);
-          src_4 = vextq_u8(src_3, src_3, 1);
-
-          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
-                                 offset_bits_horiz, reduce_bits_horiz);
-        }
-      } else {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          const uint8_t *src = ref + iy * stride + ix4 - 7;
-          src_1 = vld1q_u8(src);
-          src_2 = vextq_u8(src_1, src_1, 1);
-          src_3 = vextq_u8(src_2, src_2, 1);
-          src_4 = vextq_u8(src_3, src_3, 1);
-
-          horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
-                                 offset_bits_horiz, reduce_bits_horiz);
-        }
-      }
-
-      // vertical
-      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-
-        const int16x8_t *v_src = tmp + (k + 4);
-
-        vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
-
-        res_lo = vaddq_s32(res_lo, add_const_vert);
-        res_hi = vaddq_s32(res_hi, add_const_vert);
-
-        if (conv_params->is_compound) {
-          uint16_t *const p =
-              (uint16_t *)&conv_params
-                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
-
-          res_lo = vrshlq_s32(res_lo, shift_vert);
-          if (conv_params->do_average) {
-            uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
-            uint16x4_t tmp16_lo = vld1_u16(p);
-            int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
-            int16x4_t tmp16_low;
-            if (use_wtd_comp_avg) {
-              res_lo = vmulq_s32(res_lo, bwd);
-              tmp32_lo = vmulq_s32(tmp32_lo, fwd);
-              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
-              tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
-            } else {
-              tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
-              tmp16_low = vshrn_n_s32(tmp32_lo, 1);
-            }
-            int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
-            res_low = vqrshl_s16(res_low, round_bits_vec);
-            int16x8_t final_res_low = vcombine_s16(res_low, res_low);
-            uint8x8_t res_8_low = vqmovun_s16(final_res_low);
-
-            vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
-          } else {
-            uint16x4_t res_u16_low = vqmovun_s32(res_lo);
-            vst1_u16(p, res_u16_low);
-          }
-          if (p_width > 4) {
-            uint16_t *const p4 =
-                (uint16_t *)&conv_params
-                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-
-            res_hi = vrshlq_s32(res_hi, shift_vert);
-            if (conv_params->do_average) {
-              uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
-
-              uint16x4_t tmp16_hi = vld1_u16(p4);
-              int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
-              int16x4_t tmp16_high;
-              if (use_wtd_comp_avg) {
-                res_hi = vmulq_s32(res_hi, bwd);
-                tmp32_hi = vmulq_s32(tmp32_hi, fwd);
-                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
-                tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
-              } else {
-                tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
-                tmp16_high = vshrn_n_s32(tmp32_hi, 1);
-              }
-              int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
-              res_high = vqrshl_s16(res_high, round_bits_vec);
-              int16x8_t final_res_high = vcombine_s16(res_high, res_high);
-              uint8x8_t res_8_high = vqmovun_s16(final_res_high);
-
-              vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
-                            0);
-            } else {
-              uint16x4_t res_u16_high = vqmovun_s32(res_hi);
-              vst1_u16(p4, res_u16_high);
-            }
-          }
-        } else {
-          res_lo = vrshlq_s32(res_lo, shift_vert);
-          res_hi = vrshlq_s32(res_hi, shift_vert);
-
-          result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
-          result_final = vsubq_s16(result_final, sub_constant);
-
-          uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
-          uint8x8_t val = vqmovun_s16(result_final);
-
-          if (p_width == 4) {
-            vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
-          } else {
-            vst1_u8(p, val);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
deleted file mode 100644
index 4d970ed..0000000
--- a/av1/common/arm/wiener_convolve_neon.c
+++ /dev/null
@@ -1,531 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_ports/mem.h"
-#include "av1/common/common.h"
-#include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
-
-/* Wiener filter 2D
-   Apply horizontal filter and store in a temporary buffer. When applying
-   vertical filter, overwrite the original pixel values.
- */
-void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h,
-                                      const ConvolveParams *conv_params) {
-  uint16_t *d_tmp;
-  uint8_t *d;
-  const uint8_t *src_ptr, *s_tmp;
-  uint16_t *dst_ptr;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  int width, height;
-  const int bd = 8;
-  const int intermediate_height = h + SUBPEL_TAPS - 1;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  int16_t filter_x_tmp[7], filter_y_tmp[7];
-
-  DECLARE_ALIGNED(16, uint16_t,
-                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
-
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w % 8));
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(filter_x[7] == 0);
-  assert(filter_y[7] == 0);
-
-  /* assumption of horizontal filtering output will not exceed 15 bit.
-     ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
-     16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
-   */
-  assert((conv_params->round_0) >= 1);
-
-  memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
-  memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
-
-  filter_x_tmp[3] += (1 << FILTER_BITS);
-  filter_y_tmp[3] += (1 << FILTER_BITS);
-
-  s_tmp = src - center_tap * src_stride - center_tap;
-  dst_ptr = temp;
-  src_ptr = s_tmp;
-  height = intermediate_height;
-
-  /* if height is a multiple of 8 */
-  if (!(h & 7)) {
-    int16x8_t res0, res1, res2, res3;
-    uint16x8_t res4;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
-#if defined(__aarch64__)
-    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
-    uint8x8_t t8, t9, t10, t11, t12, t13, t14;
-
-    do {
-      const uint8_t *s;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-      width = w;
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * dst_stride);
-
-      do {
-        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
-        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
-        res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
-        res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
-        res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                           bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
-        res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                           bd, conv_params->round_0);
-
-        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
-                          &res11);
-        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
-                      res10, res11);
-
-        t0 = t8;
-        t1 = t9;
-        t2 = t10;
-        t3 = t11;
-        t4 = t12;
-        t5 = t13;
-        t6 = t14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * MAX_SB_SIZE;
-      height -= 8;
-    } while (height > 0);
-#else
-    uint8x8_t temp_0;
-
-    do {
-      const uint8_t *s;
-
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s = src_ptr + 8;
-      d_tmp = dst_ptr;
-      width = w;
-
-      __builtin_prefetch(dst_ptr);
-
-      do {
-        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        temp_0 = t0;
-        t0 = t7;
-
-        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        vst1q_u16(d_tmp, res4);
-
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += MAX_SB_SIZE;
-      height--;
-    } while (height > 0);
-#endif
-  } else {
-    /*if height is a multiple of 4*/
-    const uint8_t *s;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint16x8_t d0;
-    uint8x8_t t0, t1, t2, t3;
-
-#if defined(__aarch64__)
-    uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t d1, d2, d3;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x4_t s11, s12, s13, s14;
-    do {
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
-      transpose_u8_8x4(&t0, &t1, &t2,
-                       &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/
-
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-      s0 = vget_low_s16(tt0);  /*pa0 pb0 pc0 pd0 -- pixel_a0*/
-      s1 = vget_low_s16(tt1);  /*pa1 pb1 pc1 pd1 */
-      s2 = vget_low_s16(tt2);  /*pa2 pb2 pc2 pd2 */
-      s3 = vget_low_s16(tt3);  /*pa3 pb3 pc3 pd3 */
-      s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
-      s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
-      s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-      width = w;
-
-      do {
-        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        s7 = vget_low_s16(tt0); /*pa7  pb7  pc7  pd7  */ /*4x8*/
-        s8 = vget_low_s16(tt1);   /*pa8  pb8  pc8  pd8  */
-        s9 = vget_low_s16(tt2);   /*pa9  pb9  pc9  pd9  */
-        s10 = vget_low_s16(tt3);  /*pa10 pb10 pc10 pd10 */
-        s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
-        s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
-        s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
-        s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */
-
-        res0 = wiener_convolve8_horiz_4x8(
-            s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
-        res1 = wiener_convolve8_horiz_4x8(
-            s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
-        res2 = wiener_convolve8_horiz_4x8(
-            s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
-        res3 = wiener_convolve8_horiz_4x8(
-            s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
-        res4 =
-            wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
-                                       filter_x_tmp, bd, conv_params->round_0);
-        res5 =
-            wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
-                                       filter_x_tmp, bd, conv_params->round_0);
-        res6 =
-            wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
-                                       filter_x_tmp, bd, conv_params->round_0);
-        res7 =
-            wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
-                                       filter_x_tmp, bd, conv_params->round_0);
-
-        transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
-                          &res7, &d0, &d1, &d2, &d3);
-
-        store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * MAX_SB_SIZE;
-      height -= 4;
-    } while (height > 0);
-#else
-    uint8x8_t temp_0, t4, t5, t6, t7;
-
-    do {
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-
-      __builtin_prefetch(dst_ptr);
-
-      s = src_ptr + 8;
-      d_tmp = dst_ptr;
-      width = w;
-
-      do {
-        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        temp_0 = t0;
-        t0 = t7;
-
-        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
-        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
-        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
-        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
-                                        conv_params->round_0);
-
-        vst1q_u16(d_tmp, d0);
-
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += src_stride;
-      dst_ptr += MAX_SB_SIZE;
-      height -= 1;
-    } while (height > 0);
-#endif
-  }
-
-  {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint8x8_t t0;
-#if defined(__aarch64__)
-    int16x8_t s8, s9, s10;
-    uint8x8_t t1, t2, t3;
-#endif
-    int16_t *src_tmp_ptr, *s;
-    uint8_t *dst_tmp_ptr;
-    height = h;
-    width = w;
-    src_tmp_ptr = (int16_t *)temp;
-    dst_tmp_ptr = dst;
-    src_stride = MAX_SB_SIZE;
-
-    do {
-      s = src_tmp_ptr;
-      s0 = vld1q_s16(s);
-      s += src_stride;
-      s1 = vld1q_s16(s);
-      s += src_stride;
-      s2 = vld1q_s16(s);
-      s += src_stride;
-      s3 = vld1q_s16(s);
-      s += src_stride;
-      s4 = vld1q_s16(s);
-      s += src_stride;
-      s5 = vld1q_s16(s);
-      s += src_stride;
-      s6 = vld1q_s16(s);
-      s += src_stride;
-      d = dst_tmp_ptr;
-      height = h;
-
-#if defined(__aarch64__)
-      do {
-        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
-
-        s7 = vld1q_s16(s);
-        s += src_stride;
-        s8 = vld1q_s16(s);
-        s += src_stride;
-        s9 = vld1q_s16(s);
-        s += src_stride;
-        s10 = vld1q_s16(s);
-        s += src_stride;
-
-        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
-                                       bd, conv_params->round_1);
-        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
-                                       bd, conv_params->round_1);
-        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
-                                       bd, conv_params->round_1);
-        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
-                                       bd, conv_params->round_1);
-
-        vst1_u8(d, t0);
-        d += dst_stride;
-        vst1_u8(d, t1);
-        d += dst_stride;
-        vst1_u8(d, t2);
-        d += dst_stride;
-        vst1_u8(d, t3);
-        d += dst_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        height -= 4;
-      } while (height > 3);
-
-      if (height != 0) {
-        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
-
-        do {
-          s7 = vld1q_s16(s);
-          s += src_stride;
-
-          t0 =
-              wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
-                                        filter_y_tmp, bd, conv_params->round_1);
-          vst1_u8(d, t0);
-          d += dst_stride;
-
-          s0 = s1;
-          s1 = s2;
-          s2 = s3;
-          s3 = s4;
-          s4 = s5;
-          s5 = s6;
-          s6 = s7;
-          height -= 1;
-        } while (height > 0);
-      }
-
-      src_tmp_ptr += 8;
-      dst_tmp_ptr += 8;
-
-      w -= 8;
-    } while (w > 0);
-#else
-      do {
-        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-
-        s7 = vld1q_s16(s);
-        s += src_stride;
-
-        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
-                                       bd, conv_params->round_1);
-
-        vst1_u8(d, t0);
-        d += dst_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        height -= 1;
-      } while (height > 0);
-
-      src_tmp_ptr += 8;
-      dst_tmp_ptr += 8;
-
-      w -= 8;
-    } while (w > 0);
-#endif
-  }
-}
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index 1f5fbc1..b2cfe6f 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -455,7 +455,6 @@
   // Color config.
   aom_bit_depth_t bit_depth;  // AOM_BITS_8 in profile 0 or 1,
                               // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
-  uint8_t use_highbitdepth;   // If true, we need to use 16bit frame buffers.
   uint8_t monochrome;         // Monochorme video
   aom_color_primaries_t color_primaries;
   aom_transfer_characteristics_t transfer_characteristics;
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index ff547fc..77133ad 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -938,62 +938,39 @@
         tx_size = TX_4X4;
       }
 
-      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
       const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
 #if CONFIG_NEW_DF
 
       if (params.filter_length) {
-        if (use_highbitdepth) {
-          aom_highbd_lpf_vertical_generic_c(
-              CONVERT_TO_SHORTPTR(p), dst_stride, params.filter_length,
-              &params.q_threshold, &params.side_threshold, bit_depth);
-        } else {
-          aom_lpf_vertical_generic_c(p, dst_stride, params.filter_length,
-                                     &params.q_threshold,
-                                     &params.side_threshold);
-        }
+        aom_highbd_lpf_vertical_generic_c(
+            CONVERT_TO_SHORTPTR(p), dst_stride, params.filter_length,
+            &params.q_threshold, &params.side_threshold, bit_depth);
       }
 #else
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                      params.mblim, params.lim, params.hev_thr,
-                                      bit_depth);
-          else
-            aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                    params.mblim, params.lim, params.hev_thr,
+                                    bit_depth);
           break;
         case 6:  // apply 6-tap filter for chroma plane only
           assert(plane != 0);
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                      params.mblim, params.lim, params.hev_thr,
-                                      bit_depth);
-          else
-            aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                    params.mblim, params.lim, params.hev_thr,
+                                    bit_depth);
           break;
         // apply 8-tap filtering
         case 8:
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                      params.mblim, params.lim, params.hev_thr,
-                                      bit_depth);
-          else
-            aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
-                               params.hev_thr);
+          aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                    params.mblim, params.lim, params.hev_thr,
+                                    bit_depth);
           break;
         // apply 14-tap filtering
         case 14:
-          if (use_highbitdepth)
-            aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                       params.mblim, params.lim, params.hev_thr,
-                                       bit_depth);
-          else
-            aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
-                                params.hev_thr);
+          aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                     params.mblim, params.lim, params.hev_thr,
+                                     bit_depth);
           break;
         // no filtering
         default: break;
@@ -1041,64 +1018,41 @@
         params.filter_length = 0;
         tx_size = TX_4X4;
       }
-      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
       const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
 
 #if CONFIG_NEW_DF
       if (params.filter_length) {
-        if (use_highbitdepth) {
-          aom_highbd_lpf_horizontal_generic_c(
-              CONVERT_TO_SHORTPTR(p), dst_stride, params.filter_length,
-              &params.q_threshold, &params.side_threshold, bit_depth);
-        } else {
-          aom_lpf_horizontal_generic_c(p, dst_stride, params.filter_length,
-                                       &params.q_threshold,
-                                       &params.side_threshold);
-        }
+        aom_highbd_lpf_horizontal_generic_c(
+            CONVERT_TO_SHORTPTR(p), dst_stride, params.filter_length,
+            &params.q_threshold, &params.side_threshold, bit_depth);
       }
 
 #else
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+          aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                      params.mblim, params.lim, params.hev_thr,
+                                      bit_depth);
           break;
         // apply 6-tap filtering
         case 6:
           assert(plane != 0);
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+          aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                      params.mblim, params.lim, params.hev_thr,
+                                      bit_depth);
           break;
         // apply 8-tap filtering
         case 8:
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                        params.mblim, params.lim,
-                                        params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
-                                 params.hev_thr);
+          aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                      params.mblim, params.lim, params.hev_thr,
+                                      bit_depth);
           break;
         // apply 14-tap filtering
         case 14:
-          if (use_highbitdepth)
-            aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
-                                         params.mblim, params.lim,
-                                         params.hev_thr, bit_depth);
-          else
-            aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
-                                  params.hev_thr);
+          aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
+                                       params.mblim, params.lim, params.hev_thr,
+                                       bit_depth);
           break;
         // no filtering
         default: break;
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 4b4d704..f3bafce 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -57,9 +57,6 @@
 typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
 
 /* Function pointers return by CfL functions */
-typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
-                                     uint16_t *output_q3);
-
 typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
                                      uint16_t *output_q3);
 
@@ -68,9 +65,6 @@
 
 typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
 
-typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
-                                   int dst_stride, int alpha_q3);
-
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;
@@ -85,18 +79,12 @@
   $avx2_x86_64 = 'avx2';
 }
 
-add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
-specialize qw/av1_convolve_horiz_rs sse4_1/;
-
 add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
 specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
 
 add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bd";
 specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2/;
 
-add_proto qw/void av1_wiener_convolve_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
-specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
-
 # directional intra predictor functions
 add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy, int mrl_index";
 specialize qw/av1_dr_prediction_z1 avx2/;
@@ -118,24 +106,15 @@
 
 # optical flow interpolation function
 if (aom_config("CONFIG_OPTFLOW_REFINEMENT") eq "yes") {
-  add_proto qw/void av1_bicubic_grad_interpolation/, "const int16_t *pred_src,int16_t *x_grad,int16_t *y_grad,const int blk_width,const int blk_height";
-  specialize qw/av1_bicubic_grad_interpolation sse4_1/;
-
   add_proto qw/void av1_bicubic_grad_interpolation_highbd/, "const int16_t *pred_src,int16_t *x_grad,int16_t *y_grad,const int blk_width,const int blk_height";
   specialize qw/av1_bicubic_grad_interpolation_highbd sse4_1/;
 
-  add_proto qw/int av1_opfl_mv_refinement_nxn_lowbd/, " const uint8_t *p0, int pstride0, const uint8_t *p1, int pstride1, const int16_t *gx0, const int16_t *gy0, const int16_t *gx1, const int16_t *gy1, int gstride, int bw, int bh, int n, int d0, int d1, int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0, int *vx1, int *vy1";
-  specialize qw/av1_opfl_mv_refinement_nxn_lowbd sse4_1/;
-
   add_proto qw/int av1_opfl_mv_refinement_nxn_highbd/, " const uint16_t *p0, int pstride0, const uint16_t *p1, int pstride1, const int16_t *gx0, const int16_t *gy0, const int16_t *gx1, const int16_t *gy1, int gstride, int bw, int bh, int n, int d0, int d1, int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0, int *vx1, int *vy1";
   specialize qw/av1_opfl_mv_refinement_nxn_highbd sse4_1/;
 
   add_proto qw/int av1_opfl_mv_refinement_nxn_interp_grad/, " const int16_t *pdiff, int pstride,const int16_t *gx, const int16_t *gy, int gstride, int bw, int bh, int n,int d0, int d1, int grad_prec_bits,int mv_prec_bits, int *vx0, int *vy0,int *vx1, int *vy1";
   specialize qw/av1_opfl_mv_refinement_nxn_interp_grad sse4_1/;
 
-  add_proto qw/void av1_copy_pred_array/, "const uint8_t *src1, const uint8_t *src2, int16_t *dst1,int16_t *dst2, int bw, int bh,int d0, int d1";
-  specialize qw/av1_copy_pred_array sse4_1/;
-
   add_proto qw/void av1_copy_pred_array_highbd/, "const uint16_t *src1, const uint16_t *src2, int16_t *dst1,int16_t *dst2, int bw, int bh, int d0, int d1";
   specialize qw/av1_copy_pred_array_highbd sse4_1/;
 }
@@ -163,14 +142,6 @@
   add_proto qw/void inv_stxfm/ , "tran_low_t *src, tran_low_t *dst, const PREDICTION_MODE mode, const uint8_t stx_idx, const int size";
   specialize qw/inv_stxfm sse4_1/;
 }
-add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-if (aom_config("CONFIG_IST") eq "yes") {
-  # Disable neon version due to: https://crbug.com/aomedia/3090#c10
-  specialize qw/av1_inv_txfm_add ssse3 avx2/;
-} else {
-  specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
-}
-
 
 add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/;
@@ -212,69 +183,49 @@
 add_proto qw/void av1_highbd_inv_txfm_add_64x16/,  "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
 specialize qw/av1_highbd_inv_txfm_add_64x64  neon/;
 
-add_proto qw/void av1_inv_txfm2d_add_4x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_4x4 neon/;
-add_proto qw/void av1_inv_txfm2d_add_8x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_8x8 neon/;
-add_proto qw/void av1_inv_txfm2d_add_4x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_4x8 neon/;
-add_proto qw/void av1_inv_txfm2d_add_8x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_8x4 neon/;
-add_proto qw/void av1_inv_txfm2d_add_4x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_4x16 neon/;
-add_proto qw/void av1_inv_txfm2d_add_16x4/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_16x4 neon/;
-add_proto qw/void av1_inv_txfm2d_add_8x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_8x16  neon/;
-add_proto qw/void av1_inv_txfm2d_add_16x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_16x8  neon/;
-add_proto qw/void av1_inv_txfm2d_add_16x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_16x32  neon/;
-add_proto qw/void av1_inv_txfm2d_add_32x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_32x16  neon/;
-add_proto qw/void av1_inv_txfm2d_add_32x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_32x32  neon/;
-add_proto qw/void av1_inv_txfm2d_add_32x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_32x64  neon/;
-add_proto qw/void av1_inv_txfm2d_add_64x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_64x32  neon/;
-add_proto qw/void av1_inv_txfm2d_add_64x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_64x64  neon/;
-add_proto qw/void av1_inv_txfm2d_add_8x32/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_8x32  neon/;
-add_proto qw/void av1_inv_txfm2d_add_32x8/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_32x8  neon/;
-add_proto qw/void av1_inv_txfm2d_add_16x64/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_16x64  neon/;
-add_proto qw/void av1_inv_txfm2d_add_64x16/,  "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
-specialize qw/av1_inv_txfm2d_add_64x16  neon/;
-
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
 add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_4x8 neon/;
 add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_8x4 neon/;
 add_proto qw/void av1_inv_txfm2d_add_8x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_8x16 neon/;
 add_proto qw/void av1_inv_txfm2d_add_16x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_16x8 neon/;
 add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_16x32 neon/;
 add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_32x16 neon/;
 add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
+specialize qw/av1_inv_txfm2d_add_4x4 sse4_1 neon/;
 add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
-specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
+specialize qw/av1_inv_txfm2d_add_8x8 sse4_1 neon/;
 add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_16x16 neon/;
 add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_32x32 neon/;
 
 add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_64x64 neon/;
 add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_32x64 neon/;
 add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_64x32 neon/;
 add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_16x64 neon/;
 add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_64x16 neon/;
 
 add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_4x16 neon/;
 add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_16x4 neon/;
 add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_8x32 neon/;
 add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_32x8 neon/;
 
 # directional intra predictor functions
 add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd, int mrl_index";
@@ -292,9 +243,6 @@
 #endif
 
 # build compound seg mask functions
-add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
-specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
-
 add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
 specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
 
@@ -307,7 +255,6 @@
 
 # Resize functions.
 add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
-specialize qw/av1_resize_and_extend_frame ssse3 neon/;
 
 #
 # Encoder functions below this point.
@@ -315,15 +262,6 @@
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   # ENCODEMB INVOKE
-
-  # the transform coefficients are held in 32-bit
-  # values, so the assembler code for  av1_block_error can no longer be used.
-  add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/av1_block_error sse2 avx2 neon/;
-
-  add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
-  specialize qw/av1_block_error_lp avx2 neon/;
-
   add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr, const int32_t *round_ptr, const int32_t *quant_ptr, const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
@@ -386,8 +324,6 @@
   #
   # Motion search
   #
-  add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-  specialize qw/av1_apply_temporal_filter sse2 avx2/;
   add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, const uint8_t *pred, uint32_t *accum, uint16_t *count";
   specialize qw/av1_highbd_apply_temporal_filter sse2/;
 
@@ -435,18 +371,9 @@
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
   specialize qw/av1_get_crc32c_value sse4_2/;
 
-  add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H";
-  specialize qw/av1_compute_stats sse4_1 avx2/;
-
   add_proto qw/void av1_compute_stats_highbd/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
   specialize qw/av1_compute_stats_highbd sse4_1 avx2/;
 
-  add_proto qw/void av1_calc_proj_params/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
-  specialize qw/av1_calc_proj_params avx2/;
-
-  add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
-  specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
-
   add_proto qw/int64_t av1_highbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
   specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/;
   add_proto qw/void av1_get_horver_correlation_full/, " const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
@@ -473,7 +400,6 @@
 add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
 add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int coeff_shift";
 
-add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
 add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
 
 # VS compiling for 32 bit targets does not support vector types in
@@ -482,7 +408,6 @@
 if ($opts{config} !~ /libs-x86-win32-vs.*/) {
   specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
   specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
-  specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
   specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
 }
 
@@ -497,15 +422,9 @@
 
 # WARPED_MOTION / GLOBAL_MOTION functions
 
-add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
-specialize qw/av1_warp_affine sse4_1 avx2 neon/;
-
 add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
 specialize qw/av1_highbd_warp_affine sse4_1 avx2/;
 
-add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
-specialize qw/av1_calc_frame_error sse2 avx2/;
-
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
   specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
@@ -513,23 +432,16 @@
 
 # LOOP_RESTORATION functions
 
-add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+add_proto qw/void av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth";
 specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
 
 add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
-                                 int sgr_params_idx, int bit_depth, int highbd";
+                                 int sgr_params_idx, int bit_depth";
 specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
 
 # CONVOLVE_ROUND/COMPOUND_ROUND functions
 
-add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
-add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
-add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params";
 add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
 add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
 add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
@@ -539,17 +451,6 @@
 add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params, int bd";
 add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd";
 
-add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
-
-specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
-specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
-specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
-specialize qw/av1_convolve_2d_scale sse4_1/;
-specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/;
-specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
-specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
-specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
-
 specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/;
 specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
 specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
@@ -560,11 +461,6 @@
 specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
 
 # INTRA_EDGE functions
-add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
-specialize qw/av1_filter_intra_edge sse4_1/;
-add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
-specialize qw/av1_upsample_intra_edge sse4_1/;
-
 add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
 specialize qw/av1_filter_intra_edge_high sse4_1/;
 add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
@@ -574,15 +470,6 @@
 add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
 specialize qw/cfl_get_subtract_average_fn sse2 avx2 neon vsx/;
 
-add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
-
-add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
-
-add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
-
 add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
 specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
 
@@ -595,7 +482,4 @@
 add_proto qw/cfl_predict_hbd_fn cfl_get_predict_hbd_fn/, "TX_SIZE tx_size";
 specialize qw/cfl_get_predict_hbd_fn ssse3 avx2 neon/;
 
-add_proto qw/cfl_predict_lbd_fn cfl_get_predict_lbd_fn/, "TX_SIZE tx_size";
-specialize qw/cfl_get_predict_lbd_fn ssse3 avx2 neon/;
-
 1;
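
Note on the av1_rtcd_defs.pl changes above: with the low bit-depth prototypes dropped from the RTCD table, the remaining convolve entry points all take uint16_t pixel buffers plus an explicit bit depth. A minimal caller sketch under that assumption, using the av1_highbd_convolve_2d_sr prototype kept in the table above (illustrative only, not part of this patch; sketch_convolve is a hypothetical helper name):

/* Illustrative sketch: calling the remaining high bit-depth convolve
 * entry point now that the 8-bit variants are gone. The signature below
 * mirrors the av1_highbd_convolve_2d_sr prototype retained in
 * av1_rtcd_defs.pl; 8-bit sources must already have been upshifted to
 * 16-bit storage before reaching this point. */
static void sketch_convolve(const uint16_t *src, int src_stride,
                            uint16_t *dst, int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            int subpel_x_qn, int subpel_y_qn,
                            ConvolveParams *conv_params, int bd) {
  av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, filter_params_y, subpel_x_qn,
                            subpel_y_qn, conv_params, bd);
}
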
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index bd19261..25fcd1d 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1199,16 +1199,6 @@
 
 /*!\cond */
 
-static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
-  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
-}
-
-static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
-  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-             ? CONVERT_TO_BYTEPTR(buf16)
-             : buf16;
-}
-
 static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_4X4: return 0;
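
Note on the blockd.h change above: removing is_cur_buf_hbd() and get_buf_by_bd() means 16-bit scratch buffers are always passed around through CONVERT_TO_BYTEPTR rather than being selected per bit depth. A minimal sketch of the before/after calling pattern, assuming the CONVERT_TO_BYTEPTR macro from aom_dsp (illustrative only, not part of this patch; sketch_wrap_pred_buf is a hypothetical helper name):

/* Illustrative sketch: how a caller adapts once get_buf_by_bd() is gone.
 * Before, get_buf_by_bd(xd, buf) returned either the 8-bit buffer itself
 * or CONVERT_TO_BYTEPTR(buf16) depending on YV12_FLAG_HIGHBITDEPTH; now
 * the 16-bit storage is wrapped unconditionally. */
static uint8_t *sketch_wrap_pred_buf(uint16_t *buf16) {
  return CONVERT_TO_BYTEPTR(buf16);
}
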
diff --git a/av1/common/ccso.c b/av1/common/ccso.c
index 5f57a95..fa03798 100644
--- a/av1/common/ccso.c
+++ b/av1/common/ccso.c
@@ -311,99 +311,6 @@
 }
 #endif
 
-/* Apply CCSO for one color component (low bit-depth) */
-void apply_ccso_filter(AV1_COMMON *cm, MACROBLOCKD *xd, const int plane,
-                       const uint16_t *temp_rec_y_buf, uint8_t *rec_yv_8,
-                       const int dst_stride, const int8_t *filter_offset,
-#if CONFIG_CCSO_EXT
-                       const int shift_bits,
-#endif
-                       const uint8_t quant_step_size,
-                       const uint8_t ext_filter_support) {
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const int ccso_stride_ext = xd->plane[0].dst.width + (CCSO_PADDING_SIZE << 1);
-  const int pic_height_c = xd->plane[1].dst.height;
-  const int pic_width_c = xd->plane[1].dst.width;
-  int rec_luma_idx[2];
-  const int inv_quant_step = quant_step_size * -1;
-  int rec_idx[2];
-
-  derive_ccso_sample_pos(rec_idx, ccso_stride_ext, ext_filter_support);
-
-  const int8_t *offset_buf;
-  if (plane > 0) {
-    offset_buf = cm->ccso_info.filter_offset[plane - 1];
-  } else {
-    offset_buf = filter_offset;
-  }
-  int ccso_stride_ext_idx[1 << CCSO_BLK_SIZE];
-  int dst_stride_idx[1 << CCSO_BLK_SIZE];
-  for (int i = 0; i < (1 << CCSO_BLK_SIZE); i++) {
-    ccso_stride_ext_idx[i] = ccso_stride_ext * i;
-    dst_stride_idx[i] = dst_stride * i;
-  }
-  const int pad_stride =
-      CCSO_PADDING_SIZE * ccso_stride_ext + CCSO_PADDING_SIZE;
-  const int y_uv_hori_scale = xd->plane[1].subsampling_x;
-  const int y_uv_vert_scale = xd->plane[1].subsampling_y;
-  for (int y = 0; y < pic_height_c; y += (1 << CCSO_BLK_SIZE)) {
-    for (int x = 0; x < pic_width_c; x += (1 << CCSO_BLK_SIZE)) {
-      if (plane > 0) {
-        const int ccso_blk_idx =
-            (1 << CCSO_BLK_SIZE >>
-             (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) *
-                (y >> CCSO_BLK_SIZE) * mi_params->mi_stride +
-            (1 << CCSO_BLK_SIZE >>
-             (MI_SIZE_LOG2 - xd->plane[plane].subsampling_x)) *
-                (x >> CCSO_BLK_SIZE);
-        const bool use_ccso =
-            (plane == 1) ? mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_u
-                         : mi_params->mi_grid_base[ccso_blk_idx]->ccso_blk_v;
-        if (!use_ccso) continue;
-      }
-      int y_offset;
-      int x_offset;
-      if (y + (1 << CCSO_BLK_SIZE) >= pic_height_c)
-        y_offset = pic_height_c - y;
-      else
-        y_offset = (1 << CCSO_BLK_SIZE);
-
-      if (x + (1 << CCSO_BLK_SIZE) >= pic_width_c)
-        x_offset = pic_width_c - x;
-      else
-        x_offset = (1 << CCSO_BLK_SIZE);
-
-      for (int y_off = 0; y_off < y_offset; y_off++) {
-        for (int x_off = 0; x_off < x_offset; x_off++) {
-#if CONFIG_CCSO_EXT
-          const int band_num =
-              temp_rec_y_buf[((ccso_stride_ext_idx[y_off] << y_uv_vert_scale) +
-                              ((x + x_off) << y_uv_hori_scale)) +
-                             pad_stride] >>
-              shift_bits;
-#endif
-          cal_filter_support(
-              rec_luma_idx,
-              &temp_rec_y_buf[((ccso_stride_ext_idx[y_off] << y_uv_vert_scale) +
-                               ((x + x_off) << y_uv_hori_scale)) +
-                              pad_stride],
-              quant_step_size, inv_quant_step, rec_idx);
-          int offset_val = offset_buf[
-#if CONFIG_CCSO_EXT
-              (band_num << 4) +
-#endif
-              (rec_luma_idx[0] << 2) + rec_luma_idx[1]];
-          rec_yv_8[dst_stride_idx[y_off] + x + x_off] =
-              clamp(offset_val + rec_yv_8[dst_stride_idx[y_off] + x + x_off], 0,
-                    (1 << cm->seq_params.bit_depth) - 1);
-        }
-      }
-    }
-    temp_rec_y_buf += (ccso_stride_ext << (CCSO_BLK_SIZE + y_uv_vert_scale));
-    rec_yv_8 += (dst_stride << CCSO_BLK_SIZE);
-  }
-}
-
 /* Apply CCSO for one filtering unit using c code (high bit-depth) */
 void ccso_filter_block_hbd_c(
     const uint16_t *temp_rec_y_buf, uint16_t *rec_uv_16, const int x,
@@ -547,7 +454,7 @@
   for (int plane = 0; plane < num_planes; plane++) {
     const int dst_stride = xd->plane[plane].dst.stride;
     const uint8_t quant_step_size = quant_sz[cm->ccso_info.quant_idx[plane]];
-    if (cm->ccso_info.ccso_enable[plane] && cm->seq_params.use_highbitdepth) {
+    if (cm->ccso_info.ccso_enable[plane]) {
       CCSO_FILTER_FUNC apply_ccso_filter_func =
           cm->ccso_info.max_band_log2[plane]
               ? (plane > 0 ? ccso_apply_chroma_mb_filter
@@ -566,16 +473,10 @@
     const uint8_t quant_step_size =
         quant_sz[cm->ccso_info.quant_idx[plane - 1]];
     if (cm->ccso_info.ccso_enable[plane - 1]) {
-      if (cm->seq_params.use_highbitdepth) {
-        apply_ccso_filter_hbd(cm, xd, plane, ext_rec_y,
-                              &CONVERT_TO_SHORTPTR(xd->plane[plane].dst.buf)[0],
-                              dst_stride, NULL, quant_step_size,
-                              cm->ccso_info.ext_filter_support[plane - 1]);
-      } else {
-        apply_ccso_filter(
-            cm, xd, plane, ext_rec_y, &xd->plane[plane].dst.buf[0], dst_stride,
-            NULL, quant_step_size, cm->ccso_info.ext_filter_support[plane - 1]);
-      }
+      apply_ccso_filter_hbd(cm, xd, plane, ext_rec_y,
+                            &CONVERT_TO_SHORTPTR(xd->plane[plane].dst.buf)[0],
+                            dst_stride, NULL, quant_step_size,
+                            cm->ccso_info.ext_filter_support[plane - 1]);
     }
 #endif
   }
diff --git a/av1/common/ccso.h b/av1/common/ccso.h
index f34a438..f963fc5 100644
--- a/av1/common/ccso.h
+++ b/av1/common/ccso.h
@@ -38,15 +38,6 @@
                             const uint8_t ext_filter_support);
 #endif
 
-void apply_ccso_filter(AV1_COMMON *cm, MACROBLOCKD *xd, const int plane,
-                       const uint16_t *temp_rec_y_buf, uint8_t *rec_yv_8,
-                       const int dst_stride, const int8_t *filter_offset,
-#if CONFIG_CCSO_EXT
-                       const int shift_bits,
-#endif
-                       const uint8_t quant_step_size,
-                       const uint8_t ext_filter_support);
-
 void apply_ccso_filter_hbd(AV1_COMMON *cm, MACROBLOCKD *xd, const int plane,
                            const uint16_t *temp_rec_y_buf, uint16_t *rec_uv_16,
                            const int dst_stride, const int8_t *filter_offset,
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index dac4531..f39fc8e 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -68,16 +68,6 @@
   return count;
 }
 
-void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride,
-                                     const uint8_t *src, int sstride, int v,
-                                     int h) {
-  for (int i = 0; i < v; i++) {
-    for (int j = 0; j < h; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
-    }
-  }
-}
-
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
                                       const uint16_t *src, int sstride, int v,
                                       int h) {
@@ -91,14 +81,10 @@
 static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride,
                         const uint8_t *src, int src_voffset, int src_hoffset,
                         int sstride, int vsize, int hsize) {
-  if (cm->seq_params.use_highbitdepth) {
-    const uint16_t *base =
-        &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
-    cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
-  } else {
-    const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
-    cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
-  }
+  (void)cm;
+  const uint16_t *base =
+      &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
+  cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
 }
 
 static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
@@ -349,29 +335,17 @@
                     vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
         }
 
-        if (cm->seq_params.use_highbitdepth) {
-          av1_cdef_filter_fb(
-              NULL,
-              &CONVERT_TO_SHORTPTR(
-                  xd->plane[pli]
-                      .dst.buf)[xd->plane[pli].dst.stride *
-                                    (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
-                                (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
-              xd->plane[pli].dst.stride,
-              &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
-              ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-              sec_strength, damping, coeff_shift);
-        } else {
-          av1_cdef_filter_fb(
-              &xd->plane[pli]
-                   .dst.buf[xd->plane[pli].dst.stride *
-                                (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
-                            (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
-              NULL, xd->plane[pli].dst.stride,
-              &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
-              ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
-              sec_strength, damping, coeff_shift);
-        }
+        av1_cdef_filter_fb(
+            NULL,
+            &CONVERT_TO_SHORTPTR(
+                xd->plane[pli]
+                    .dst.buf)[xd->plane[pli].dst.stride *
+                                  (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
+                              (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+            xd->plane[pli].dst.stride,
+            &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
+            ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+            sec_strength, damping, coeff_shift);
       }
       cdef_left = 1;
     }
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index e9dbf38..0cc728d 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -883,21 +883,6 @@
   }
 }
 
-void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
-                                              const uint8_t *src, int sstride,
-                                              int v, int h) {
-  int i, j;
-  for (i = 0; i < v; i++) {
-    for (j = 0; j < (h & ~0x7); j += 8) {
-      v64 row = v64_load_unaligned(&src[i * sstride + j]);
-      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
-    }
-    for (; j < h; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
-    }
-  }
-}
-
 void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
                                                const uint16_t *src, int sstride,
                                                int v, int h) {
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index c6312ab..091dc4c 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -38,21 +38,9 @@
   assert(pred_plane < CFL_PRED_PLANES);
   assert(width <= CFL_BUF_LINE);
 
-  if (is_cur_buf_hbd(xd)) {
-    uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
-    memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
-    return;
-  }
-
-  memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width);
-}
-
-static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst,
-                                 int dst_stride, int width, int height) {
-  for (int j = 0; j < height; j++) {
-    memcpy(dst, dc_pred_cache, width);
-    dst += dst_stride;
-  }
+  uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
+  memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
+  return;
 }
 
 static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,
@@ -70,13 +58,8 @@
   assert(pred_plane < CFL_PRED_PLANES);
   assert(width <= CFL_BUF_LINE);
   assert(height <= CFL_BUF_LINE);
-  if (is_cur_buf_hbd(xd)) {
-    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
-    cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
-                         width, height);
-    return;
-  }
-  cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride,
+  uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+  cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
                        width, height);
 }
 
@@ -147,20 +130,6 @@
   return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
 }
 
-static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
-                                     int dst_stride, int alpha_q3, int width,
-                                     int height) {
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]);
-    }
-    dst += dst_stride;
-    ac_buf_q3 += CFL_BUF_LINE;
-  }
-}
-
-CFL_PREDICT_FN(c, lbd)
-
 void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
                        int alpha_q3, int bit_depth, int width, int height) {
   for (int j = 0; j < height; j++) {
@@ -197,56 +166,9 @@
       cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
   assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
          CFL_BUF_SQUARE);
-  if (is_cur_buf_hbd(xd)) {
-    uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
-    cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride,
-                                    alpha_q3, xd->bd);
-    return;
-  }
-  cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
-}
-
-static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
-                                           int input_stride,
-                                           uint16_t *output_q3, int width,
-                                           int height) {
-  for (int j = 0; j < height; j += 2) {
-    for (int i = 0; i < width; i += 2) {
-      const int bot = i + input_stride;
-      output_q3[i >> 1] =
-          (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
-    }
-    input += input_stride << 1;
-    output_q3 += CFL_BUF_LINE;
-  }
-}
-
-static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input,
-                                           int input_stride,
-                                           uint16_t *output_q3, int width,
-                                           int height) {
-  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i += 2) {
-      output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
-    }
-    input += input_stride;
-    output_q3 += CFL_BUF_LINE;
-  }
-}
-
-static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,
-                                           int input_stride,
-                                           uint16_t *output_q3, int width,
-                                           int height) {
-  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
-  for (int j = 0; j < height; j++) {
-    for (int i = 0; i < width; i++) {
-      output_q3[i] = input[i] << 3;
-    }
-    input += input_stride;
-    output_q3 += CFL_BUF_LINE;
-  }
+  uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+  cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
+                                  xd->bd);
 }
 
 static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
@@ -305,19 +227,8 @@
   return cfl_get_luma_subsampling_444_hbd(tx_size);
 }
 
-static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
-                                                       int sub_x, int sub_y) {
-  if (sub_x == 1) {
-    if (sub_y == 1) {
-      return cfl_get_luma_subsampling_420_lbd(tx_size);
-    }
-    return cfl_get_luma_subsampling_422_lbd(tx_size);
-  }
-  return cfl_get_luma_subsampling_444_lbd(tx_size);
-}
 static void cfl_store(MACROBLOCKD *const xd, CFL_CTX *cfl, const uint8_t *input,
-                      int input_stride, int row, int col, TX_SIZE tx_size,
-                      int use_hbd) {
+                      int input_stride, int row, int col, TX_SIZE tx_size) {
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const int tx_off_log2 = MI_SIZE_LOG2;
@@ -357,13 +268,8 @@
   // Store the input into the CfL pixel buffer
   uint16_t *recon_buf_q3 =
       cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
-  if (use_hbd) {
-    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
-                                               input_stride, recon_buf_q3);
-  } else {
-    cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
-                                               recon_buf_q3);
-  }
+  cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+                                             input_stride, recon_buf_q3);
 }
 
 // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
@@ -396,8 +302,7 @@
     assert(!((row & 1) && tx_size_high[tx_size] != 4));
     sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
   }
-  cfl_store(xd, cfl, dst, pd->dst.stride, row, col, tx_size,
-            is_cur_buf_hbd(xd));
+  cfl_store(xd, cfl, dst, pd->dst.stride, row, col, tx_size);
 }
 
 static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
@@ -429,6 +334,5 @@
   const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
   tx_size = get_tx_size(width, height);
   assert(tx_size != TX_INVALID);
-  cfl_store(xd, cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
-            is_cur_buf_hbd(xd));
+  cfl_store(xd, cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size);
 }
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 8b28758..f9d6306 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -149,9 +149,6 @@
 // The RTCD script does not support passing in an array, so we wrap it in this
 // function.
 #define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
-  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
-  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
-  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
@@ -217,14 +214,6 @@
 void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
 void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
 
-#define CFL_PREDICT_lbd(arch, width, height)                              \
-  void cfl_predict_lbd_##width##x##height##_##arch(                       \
-      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,           \
-      int alpha_q3) {                                                     \
-    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
-                           height);                                       \
-  }
-
 #define CFL_PREDICT_hbd(arch, width, height)                                   \
   void cfl_predict_hbd_##width##x##height##_##arch(                            \
       const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 26d0ea6..06cf0eb 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -24,31 +24,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
-void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const int16_t *x_filters, int x0_qn,
-                             int x_step_qn) {
-  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
-      const int x_filter_idx =
-          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
-      assert(x_filter_idx <= RS_SUBPEL_MASK);
-      const int16_t *const x_filter =
-          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
-      int sum = 0;
-      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
-        sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
                                     const int16_t *x_filters, int x0_qn,
@@ -113,523 +88,6 @@
   }
 }
 
-void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                          int dst_stride, int w, int h,
-                          const InterpFilterParams *filter_params_x,
-                          const InterpFilterParams *filter_params_y,
-                          const int subpel_x_qn, const int subpel_y_qn,
-                          ConvolveParams *conv_params) {
-  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = w;
-  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int bd = 8;
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-
-  // horizontal filter
-  const uint8_t *src_horiz = src - fo_vert * src_stride;
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  for (int y = 0; y < im_h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (int k = 0; k < filter_params_x->taps; ++k) {
-        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
-      }
-      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
-      im_block[y * im_stride + x] =
-          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
-    }
-  }
-
-  // vertical filter
-  int16_t *src_vert = im_block + fo_vert * im_stride;
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t sum = 1 << offset_bits;
-      for (int k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
-      }
-      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
-                    ((1 << (offset_bits - conv_params->round_1)) +
-                     (1 << (offset_bits - conv_params->round_1 - 1)));
-      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
-    }
-  }
-}
-
-void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                         int dst_stride, int w, int h,
-                         const InterpFilterParams *filter_params_y,
-                         const int subpel_y_qn) {
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-
-  // vertical filter
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t res = 0;
-      for (int k = 0; k < filter_params_y->taps; ++k) {
-        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
-      }
-      dst[y * dst_stride + x] =
-          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
-    }
-  }
-}
-
-void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                         int dst_stride, int w, int h,
-                         const InterpFilterParams *filter_params_x,
-                         const int subpel_x_qn, ConvolveParams *conv_params) {
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int bits = FILTER_BITS - conv_params->round_0;
-
-  assert(bits >= 0);
-  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
-         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
-
-  // horizontal filter
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t res = 0;
-      for (int k = 0; k < filter_params_x->taps; ++k) {
-        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
-      }
-      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
-      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams *filter_params_x,
-                                const InterpFilterParams *filter_params_y,
-                                const int subpel_x_qn, const int subpel_y_qn,
-                                ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = w;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-
-  // horizontal filter
-  const uint8_t *src_horiz = src - fo_vert * src_stride;
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  for (int y = 0; y < im_h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (int k = 0; k < filter_params_x->taps; ++k) {
-        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
-      }
-      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
-      im_block[y * im_stride + x] =
-          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
-    }
-  }
-
-  // vertical filter
-  int16_t *src_vert = im_block + fo_vert * im_stride;
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t sum = 1 << offset_bits;
-      for (int k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
-      }
-      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-      if (conv_params->do_average) {
-        int32_t tmp = dst16[y * dst16_stride + x];
-        if (use_wtd_comp_avg) {
-          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
-          tmp = tmp >> DIST_PRECISION_BITS;
-        } else {
-          tmp += res;
-          tmp = tmp >> 1;
-        }
-        tmp -= (1 << (offset_bits - conv_params->round_1)) +
-               (1 << (offset_bits - conv_params->round_1 - 1));
-        dst[y * dst_stride + x] =
-            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
-      } else {
-        dst16[y * dst16_stride + x] = res;
-      }
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, int w, int h,
-                               const InterpFilterParams *filter_params_y,
-                               const int subpel_y_qn,
-                               ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-
-  // vertical filter
-  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t res = 0;
-      for (int k = 0; k < filter_params_y->taps; ++k) {
-        res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
-      }
-      res *= (1 << bits);
-      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
-
-      if (conv_params->do_average) {
-        int32_t tmp = dst16[y * dst16_stride + x];
-        if (use_wtd_comp_avg) {
-          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
-          tmp = tmp >> DIST_PRECISION_BITS;
-        } else {
-          tmp += res;
-          tmp = tmp >> 1;
-        }
-        tmp -= round_offset;
-        dst[y * dst_stride + x] =
-            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
-      } else {
-        dst16[y * dst16_stride + x] = res;
-      }
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, int w, int h,
-                               const InterpFilterParams *filter_params_x,
-                               const int subpel_x_qn,
-                               ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int bits = FILTER_BITS - conv_params->round_1;
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-
-  // horizontal filter
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      int32_t res = 0;
-      for (int k = 0; k < filter_params_x->taps; ++k) {
-        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
-      }
-      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
-      res += round_offset;
-
-      if (conv_params->do_average) {
-        int32_t tmp = dst16[y * dst16_stride + x];
-        if (use_wtd_comp_avg) {
-          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
-          tmp = tmp >> DIST_PRECISION_BITS;
-        } else {
-          tmp += res;
-          tmp = tmp >> 1;
-        }
-        tmp -= round_offset;
-        dst[y * dst_stride + x] =
-            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
-      } else {
-        dst16[y * dst16_stride + x] = res;
-      }
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
-                                     uint8_t *dst, int dst_stride, int w, int h,
-                                     ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  int dst16_stride = conv_params->dst_stride;
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-
-  for (int y = 0; y < h; ++y) {
-    for (int x = 0; x < w; ++x) {
-      CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
-      res += round_offset;
-
-      if (conv_params->do_average) {
-        int32_t tmp = dst16[y * dst16_stride + x];
-        if (use_wtd_comp_avg) {
-          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
-          tmp = tmp >> DIST_PRECISION_BITS;
-        } else {
-          tmp += res;
-          tmp = tmp >> 1;
-        }
-        tmp -= round_offset;
-        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
-      } else {
-        dst16[y * dst16_stride + x] = res;
-      }
-    }
-  }
-}
-
-void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int x_step_qn,
-                             const int subpel_y_qn, const int y_step_qn,
-                             ConvolveParams *conv_params) {
-  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
-  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-             filter_params_y->taps;
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  const int dst16_stride = conv_params->dst_stride;
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  assert(bits >= 0);
-  int im_stride = w;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int bd = 8;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-
-  // horizontal filter
-  const uint8_t *src_horiz = src - fo_vert * src_stride;
-  for (int y = 0; y < im_h; ++y) {
-    int x_qn = subpel_x_qn;
-    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
-      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *x_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (int k = 0; k < filter_params_x->taps; ++k) {
-        sum += x_filter[k] * src_x[k - fo_horiz];
-      }
-      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
-      im_block[y * im_stride + x] =
-          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
-    }
-    src_horiz += src_stride;
-  }
-
-  // vertical filter
-  int16_t *src_vert = im_block + fo_vert * im_stride;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  for (int x = 0; x < w; ++x) {
-    int y_qn = subpel_y_qn;
-    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
-      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
-      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(y_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *y_filter =
-          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
-      int32_t sum = 1 << offset_bits;
-      for (int k = 0; k < filter_params_y->taps; ++k) {
-        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
-      }
-      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-      if (conv_params->is_compound) {
-        if (conv_params->do_average) {
-          int32_t tmp = dst16[y * dst16_stride + x];
-          if (use_wtd_comp_avg) {
-            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
-            tmp = tmp >> DIST_PRECISION_BITS;
-          } else {
-            tmp += res;
-            tmp = tmp >> 1;
-          }
-          /* Subtract round offset and convolve round */
-          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
-                       (1 << (offset_bits - conv_params->round_1 - 1)));
-          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
-        } else {
-          dst16[y * dst16_stride + x] = res;
-        }
-      } else {
-        /* Subtract round offset and convolve round */
-        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
-                             (1 << (offset_bits - conv_params->round_1 - 1)));
-        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
-      }
-    }
-    src_vert++;
-  }
-}
-
-static void convolve_2d_scale_wrapper(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
-    ConvolveParams *conv_params) {
-  if (conv_params->is_compound) {
-    assert(conv_params->dst != NULL);
-  }
-  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
-                        y_step_qn, conv_params);
-}
-
-static void convolve_2d_facade_compound(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
-  const bool need_x = subpel_x_qn != 0;
-  const bool need_y = subpel_y_qn != 0;
-  if (!need_x && !need_y) {
-    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
-                                  conv_params);
-  } else if (need_x && !need_y) {
-    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
-                            filter_params_x, subpel_x_qn, conv_params);
-  } else if (!need_x && need_y) {
-    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
-                            filter_params_y, subpel_y_qn, conv_params);
-  } else {
-    assert(need_y && need_x);
-    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
-                             filter_params_x, filter_params_y, subpel_x_qn,
-                             subpel_y_qn, conv_params);
-  }
-}
-
-static void convolve_2d_facade_single(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
-  const bool need_x = subpel_x_qn != 0;
-  const bool need_y = subpel_y_qn != 0;
-  if (!need_x && !need_y) {
-    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
-  } else if (need_x && !need_y) {
-    // Filters with taps > 8 are only for encoder side use.
-    // TODO(any): need SIMD for > 8 taps filters
-    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
-      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_x, subpel_x_qn, conv_params);
-    } else {
-      av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
-                        subpel_x_qn, conv_params);
-    }
-  } else if (!need_x && need_y) {
-    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
-      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_y, subpel_y_qn);
-    } else {
-      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
-                        subpel_y_qn);
-    }
-  } else {
-    assert(need_x && need_y);
-
-    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
-      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
-                           filter_params_x, filter_params_y, subpel_x_qn,
-                           subpel_y_qn, conv_params);
-    } else {
-      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
-                         filter_params_x, filter_params_y, subpel_x_qn,
-                         subpel_y_qn, conv_params);
-    }
-  }
-}
-
-void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *interp_filters[2],
-                            const int subpel_x_qn, int x_step_q4,
-                            const int subpel_y_qn, int y_step_q4, int scaled,
-                            ConvolveParams *conv_params) {
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)dst;
-  (void)dst_stride;
-
-  const InterpFilterParams *filter_params_x = interp_filters[0];
-  const InterpFilterParams *filter_params_y = interp_filters[1];
-
-  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
-  // Do we have SIMD support to 4-tap case?
-  // 2-tap filter indicates that it is for IntraBC.
-  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
-    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
-    assert(!scaled);
-    if (subpel_x_qn && subpel_y_qn) {
-      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
-                           filter_params_x, filter_params_y, subpel_x_qn,
-                           subpel_y_qn, conv_params);
-      return;
-    } else if (subpel_x_qn) {
-      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_x, subpel_x_qn, conv_params);
-      return;
-    } else if (subpel_y_qn) {
-      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_y, subpel_y_qn);
-      return;
-    }
-  }
-
-  if (scaled) {
-    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
-                              filter_params_x, filter_params_y, subpel_x_qn,
-                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
-  } else if (conv_params->is_compound) {
-    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
-                                filter_params_x, filter_params_y, subpel_x_qn,
-                                subpel_y_qn, conv_params);
-  } else {
-    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
-                              filter_params_x, filter_params_y, subpel_x_qn,
-                              subpel_y_qn, conv_params);
-  }
-}
-
 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
@@ -1180,87 +638,6 @@
   return (int)((const InterpKernel *)(intptr_t)f - base);
 }
 
-static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_q4,
-                                       int x_step_q4, int w, int h,
-                                       int round0_bits) {
-  const int bd = 8;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                           (1 << (bd + FILTER_BITS - 1));
-      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
-      dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
-                               WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_q4,
-                                      int y_step_q4, int w, int h,
-                                      int round1_bits) {
-  const int bd = 8;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int rounding =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + round1_bits - 1));
-      const int sum =
-          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h,
-                                   const ConvolveParams *conv_params) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
-  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
-                             x_step_q4, w, intermediate_height,
-                             conv_params->round_0);
-  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
-                            y_step_q4, w, h, conv_params->round_1);
-}
-
 static void highbd_convolve_add_src_horiz_hip(
     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index c93bc55..3e6bfa8 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -53,13 +53,6 @@
 struct AV1Common;
 struct scale_factors;
 
-void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *interp_filters[2],
-                            const int subpel_x_qn, int x_step_q4,
-                            const int subpel_y_qn, int y_step_q4, int scaled,
-                            ConvolveParams *conv_params);
-
 static INLINE int is_uneven_wtd_comp_avg(const ConvolveParams *params) {
   return params->do_average &&
          (params->fwd_offset != (1 << (DIST_PRECISION_BITS - 1)) ||
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 04dcfd7..cd9436b 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -246,7 +246,6 @@
 #endif  // CONFIG_IST_FIX_B098
   txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
   txfm_param->bd = xd->bd;
-  txfm_param->is_hbd = is_cur_buf_hbd(xd);
   txfm_param->tx_set_type = av1_get_ext_tx_set_type(
       txfm_param->tx_size, is_inter_block(xd->mi[0], xd->tree_type),
       reduced_tx_set);
@@ -321,29 +320,6 @@
   }
 }
 
-void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
-                        const TxfmParam *txfm_param) {
-  const TX_SIZE tx_size = txfm_param->tx_size;
-  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
-  int tmp_stride = MAX_TX_SIZE;
-  int w = tx_size_wide[tx_size];
-  int h = tx_size_high[tx_size];
-  for (int r = 0; r < h; ++r) {
-    for (int c = 0; c < w; ++c) {
-      tmp[r * tmp_stride + c] = dst[r * stride + c];
-    }
-  }
-
-  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
-                          txfm_param);
-
-  for (int r = 0; r < h; ++r) {
-    for (int c = 0; c < w; ++c) {
-      dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
-    }
-  }
-}
-
 void av1_inverse_transform_block(const MACROBLOCKD *xd,
 #if CONFIG_IST
                                  tran_low_t *dqcoeff,
@@ -373,11 +349,7 @@
   av1_inv_stxfm(dqcoeff, &txfm_param);
 #endif
 
-  if (txfm_param.is_hbd) {
-    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
-  } else {
-    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
-  }
+  av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
 }
 
 // Inverse secondary transform
diff --git a/av1/common/loopfiltermask.c b/av1/common/loopfiltermask.c
index dd21f37..4c09941 100644
--- a/av1/common/loopfiltermask.c
+++ b/av1/common/loopfiltermask.c
@@ -432,93 +432,6 @@
   return ((mi_row & 3) << 4) | mi_col;
 }
 
-static void filter_selectively_vert_row2(
-    int subsampling_factor, uint8_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                     lfi1->hev_thr);
-          }
-        } else if (mask_16x16_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          }
-        } else if (mask_8x8_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-
 static void highbd_filter_selectively_vert_row2(
     int subsampling_factor, uint16_t *s, int pitch, int plane,
     uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
@@ -609,89 +522,6 @@
   }
 }
 
-static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
-                                     int subsampling, uint64_t mask_16x16,
-                                     uint64_t mask_8x8, uint64_t mask_4x4,
-                                     const loop_filter_info_n *lfi_n,
-                                     const uint8_t *lfl) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, lfin->mblim, lfin->lim,
-                                       lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_8x8 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, lfin->mblim, lfin->lim,
-                                    lfin->hev_thr);
-          count = 2;
-        } else {
-          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-
 static void highbd_filter_selectively_horiz(
     uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
     uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
@@ -998,15 +828,10 @@
       mask_4x4_1 = 0;
     }
 
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_vert_row2(
-          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_vert_row2(
-          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+    highbd_filter_selectively_vert_row2(
+        ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+        mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+        &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
 
     dst->buf += two_row_stride;
   }
@@ -1066,13 +891,9 @@
     mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
     mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
 
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
-          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
+    highbd_filter_selectively_horiz(
+        CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+        mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
 
     dst->buf += row_stride;
   }
@@ -1146,16 +967,10 @@
       uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
       uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
 
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_vert_row2(
-            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
-                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
-                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
-                                     &cm->lf_info, lfl, lfl2);
+      highbd_filter_selectively_vert_row2(
+          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
 
       dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
     }
@@ -1217,14 +1032,9 @@
       mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
       mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
 
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                        dst->stride, pl, ssx, mask_16x16,
-                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
-                                        (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
 
       dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
     }
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 04fd1fa..ee7152c 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -60,8 +60,7 @@
 void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
                            int block_height, int pix_row, int pix_col,
                            int subsampling_x, int subsampling_y, int bit_depth,
-                           int use_hbd_buf, int is_intrabc,
-                           const struct scale_factors *sf,
+                           int is_intrabc, const struct scale_factors *sf,
                            const struct buf_2d *ref_buf,
                            InterpFilter interp_filter) {
   inter_pred_params->block_width = block_width;
@@ -75,7 +74,6 @@
   inter_pred_params->subsampling_x = subsampling_x;
   inter_pred_params->subsampling_y = subsampling_y;
   inter_pred_params->bit_depth = bit_depth;
-  inter_pred_params->use_hbd_buf = use_hbd_buf;
   inter_pred_params->is_intrabc = is_intrabc;
   inter_pred_params->scale_factors = sf;
   inter_pred_params->ref_frame_buf = *ref_buf;
@@ -127,8 +125,8 @@
   // TODO(jingning): av1_warp_plane() can be further cleaned up.
   if (inter_pred_params->mode == WARP_PRED) {
     av1_warp_plane(
-        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
-        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
+        &inter_pred_params->warp_params, inter_pred_params->bit_depth,
+        inter_pred_params->ref_frame_buf.buf0,
         inter_pred_params->ref_frame_buf.width,
         inter_pred_params->ref_frame_buf.height,
         inter_pred_params->ref_frame_buf.stride, dst,
@@ -137,20 +135,11 @@
         dst_stride, inter_pred_params->subsampling_x,
         inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
   } else if (inter_pred_params->mode == TRANSLATION_PRED) {
-    if (inter_pred_params->use_hbd_buf) {
-      highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
-                             inter_pred_params->block_width,
-                             inter_pred_params->block_height,
-                             &inter_pred_params->conv_params,
-                             inter_pred_params->interp_filter_params,
-                             inter_pred_params->bit_depth);
-    } else {
-      inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
-                      inter_pred_params->block_width,
-                      inter_pred_params->block_height,
-                      &inter_pred_params->conv_params,
-                      inter_pred_params->interp_filter_params);
-    }
+    highbd_inter_predictor(
+        src, src_stride, dst, dst_stride, subpel_params,
+        inter_pred_params->block_width, inter_pred_params->block_height,
+        &inter_pred_params->conv_params,
+        inter_pred_params->interp_filter_params, inter_pred_params->bit_depth);
   }
 }
 
@@ -378,22 +367,6 @@
   }
 }
 
-void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
-                                       DIFFWTD_MASK_TYPE mask_type,
-                                       const uint8_t *src0, int src0_stride,
-                                       const uint8_t *src1, int src1_stride,
-                                       int h, int w) {
-  switch (mask_type) {
-    case DIFFWTD_38:
-      diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
-      break;
-    case DIFFWTD_38_INV:
-      diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
-      break;
-    default: assert(0);
-  }
-}
-
 static AOM_FORCE_INLINE void diffwtd_mask_highbd(
     uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
     int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
@@ -659,8 +632,7 @@
 
   av1_init_inter_params(inter_pred_params, bw, bh, pre_y, pre_x,
                         pd->subsampling_x, pd->subsampling_y, xd->bd,
-                        is_cur_buf_hbd(xd), mi->use_intrabc[0], sf, pre_buf,
-                        mi->interp_fltr);
+                        mi->use_intrabc[0], sf, pre_buf, mi->interp_fltr);
 
   inter_pred_params->conv_params = get_conv_params_no_round(
       0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
@@ -740,71 +712,9 @@
                             bw, CONVERT_TO_BYTEPTR(tmp_buf1), bw, xd->bd);
 }
 
-// Note: grad_prec_bits param returned correspond to the precision
-// of the gradient information in bits assuming gradient
-// computed at unit pixel step normalization is 0 scale.
-// Negative values indicate gradient returned at reduced precision, and
-// positive values indicate gradient returned at higher precision.
-void av1_compute_subpel_gradients_mc_lowbd(
-    MACROBLOCKD *xd, const MB_MODE_INFO *mi, int bw, int bh, int mi_x, int mi_y,
-    uint8_t **mc_buf, InterPredParams *inter_pred_params,
-    CalcSubpelParamsFunc calc_subpel_params_func, int ref, int *grad_prec_bits,
-    int16_t *x_grad, int16_t *y_grad) {
-  *grad_prec_bits = 3 - SUBPEL_GRAD_DELTA_BITS - 2;
-
-  // Original predictor
-  const MV mv_orig = mi->mv[ref].as_mv;
-  MV mv_modified = mv_orig;
-  uint8_t tmp_buf1[MAX_SB_SIZE * MAX_SB_SIZE] = { 0 };
-  uint8_t tmp_buf2[MAX_SB_SIZE * MAX_SB_SIZE] = { 0 };
-  // X gradient
-  // Get predictor to the left
-  mv_modified.col = mv_orig.col - (1 << (3 - SUBPEL_GRAD_DELTA_BITS));
-  mv_modified.row = mv_orig.row;
-  av1_build_one_inter_predictor(tmp_buf1, bw, &mv_modified, inter_pred_params,
-                                xd, mi_x, mi_y, ref, mc_buf,
-                                calc_subpel_params_func);
-  // Get predictor to the right
-  mv_modified.col = mv_orig.col + (1 << (3 - SUBPEL_GRAD_DELTA_BITS));
-  mv_modified.row = mv_orig.row;
-  av1_build_one_inter_predictor(tmp_buf2, bw, &mv_modified, inter_pred_params,
-                                xd, mi_x, mi_y, ref, mc_buf,
-                                calc_subpel_params_func);
-  // Compute difference.
-  // Note since the deltas are at +2^g/8 and -2^g/8 subpel locations
-  // (g = 3 - SUBPEL_GRAD_DELTA_BITS), the actual unit pel gradient is
-  // 4/2^g = 2^(2-g) times the difference. Therefore the gradient returned
-  // is at reduced precision by 2-g bits. That explains the grad_prec_bits
-  // return value of g-2 at the end of this function.
-
-  aom_subtract_block(bh, bw, x_grad, bw, tmp_buf2, bw, tmp_buf1, bw);
-
-  // Y gradient
-  // Get predictor below
-  mv_modified.col = mv_orig.col;
-  mv_modified.row = mv_orig.row - (1 << (3 - SUBPEL_GRAD_DELTA_BITS));
-  av1_build_one_inter_predictor(tmp_buf1, bw, &mv_modified, inter_pred_params,
-                                xd, mi_x, mi_y, ref, mc_buf,
-                                calc_subpel_params_func);
-  // Get predictor above
-  mv_modified.col = mv_orig.col;
-  mv_modified.row = mv_orig.row + (1 << (3 - SUBPEL_GRAD_DELTA_BITS));
-  av1_build_one_inter_predictor(tmp_buf2, bw, &mv_modified, inter_pred_params,
-                                xd, mi_x, mi_y, ref, mc_buf,
-                                calc_subpel_params_func);
-  // Compute difference.
-  // Note since the deltas are at +2^g/8 and -2^g/8 subpel locations
-  // (g = 3 - SUBPEL_GRAD_DELTA_BITS), the actual unit pel gradient is
-  // 4/2^g = 2^(2-g) times the difference. Therefore the gradient returned
-  // is at reduced precision by 2-g bits. That explains the grad_prec_bits
-  // return value of g-2 at the end of this function.
-
-  aom_subtract_block(bh, bw, y_grad, bw, tmp_buf2, bw, tmp_buf1, bw);
-}
-
-void av1_bicubic_grad_interpolation_c(const int16_t *pred_src, int16_t *x_grad,
-                                      int16_t *y_grad, const int bw,
-                                      const int bh) {
+void av1_bicubic_grad_interpolation_highbd_c(const int16_t *pred_src,
+                                             int16_t *x_grad, int16_t *y_grad,
+                                             const int bw, const int bh) {
 #if OPFL_BICUBIC_GRAD
   for (int i = 0; i < bh; i++) {
     for (int j = 0; j < bw; j++) {
@@ -853,12 +763,6 @@
 #endif  // OPFL_BICUBIC_GRAD
 }
 
-void av1_bicubic_grad_interpolation_highbd_c(const int16_t *pred_src,
-                                             int16_t *x_grad, int16_t *y_grad,
-                                             const int bw, const int bh) {
-  av1_bicubic_grad_interpolation_c(pred_src, x_grad, y_grad, bw, bh);
-}
-
 #if OPFL_BILINEAR_GRAD
 void av1_bilinear_grad_interpolation_c(const int16_t *pred_src, int16_t *x_grad,
                                        int16_t *y_grad, const int bw,
@@ -894,16 +798,12 @@
 #if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
 void av1_compute_subpel_gradients_interp(int16_t *pred_dst, int bw, int bh,
                                          int *grad_prec_bits, int16_t *x_grad,
-                                         int16_t *y_grad, int is_hbd) {
+                                         int16_t *y_grad) {
   // Reuse pixels in pred_dst to compute gradients
 #if OPFL_BILINEAR_GRAD
-  (void)is_hbd;
   av1_bilinear_grad_interpolation_c(pred_dst, x_grad, y_grad, bw, bh);
 #else
-  if (is_hbd)
-    av1_bicubic_grad_interpolation_highbd(pred_dst, x_grad, y_grad, bw, bh);
-  else
-    av1_bicubic_grad_interpolation(pred_dst, x_grad, y_grad, bw, bh);
+  av1_bicubic_grad_interpolation_highbd(pred_dst, x_grad, y_grad, bw, bh);
 #endif  // OPFL_BILINEAR_GRAD
   *grad_prec_bits = 3 - SUBPEL_GRAD_DELTA_BITS - 2;
 }
@@ -1164,31 +1065,6 @@
   return n_blocks;
 }
 
-// Function to compute optical flow offsets in nxn blocks
-int av1_opfl_mv_refinement_nxn_lowbd_c(const uint8_t *p0, int pstride0,
-                                       const uint8_t *p1, int pstride1,
-                                       const int16_t *gx0, const int16_t *gy0,
-                                       const int16_t *gx1, const int16_t *gy1,
-                                       int gstride, int bw, int bh, int n,
-                                       int d0, int d1, int grad_prec_bits,
-                                       int mv_prec_bits, int *vx0, int *vy0,
-                                       int *vx1, int *vy1) {
-  assert(bw % n == 0 && bh % n == 0);
-  int n_blocks = 0;
-  for (int i = 0; i < bh; i += n) {
-    for (int j = 0; j < bw; j += n) {
-      av1_opfl_mv_refinement_lowbd(
-          p0 + (i * pstride0 + j), pstride0, p1 + (i * pstride1 + j), pstride1,
-          gx0 + (i * gstride + j), gy0 + (i * gstride + j),
-          gx1 + (i * gstride + j), gy1 + (i * gstride + j), gstride, n, n, d0,
-          d1, grad_prec_bits, mv_prec_bits, vx0 + n_blocks, vy0 + n_blocks,
-          vx1 + n_blocks, vy1 + n_blocks);
-      n_blocks++;
-    }
-  }
-  return n_blocks;
-}
-
 #if OPFL_COMBINE_INTERP_GRAD_LS
 static AOM_FORCE_INLINE void compute_pred_using_interp_grad(
     const uint8_t *src1, const uint8_t *src2, int16_t *dst1, int16_t *dst2,
@@ -1208,32 +1084,6 @@
 }
 #endif  // OPFL_COMBINE_INTERP_GRAD_LS
 
-void av1_copy_pred_array_c(const uint8_t *src1, const uint8_t *src2,
-                           int16_t *dst1, int16_t *dst2, int bw, int bh, int d0,
-                           int d1) {
-#if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-#if OPFL_COMBINE_INTERP_GRAD_LS
-  compute_pred_using_interp_grad(src1, src2, dst1, dst2, bw, bh, d0, d1);
-#else
-  (void)src2;
-  (void)dst2;
-  (void)d0;
-  (void)d1;
-  for (int i = 0; i < bh; ++i)
-    for (int j = 0; j < bw; ++j) dst1[i * bw + j] = (int16_t)src1[i * bw + j];
-#endif  // OPFL_COMBINE_INTERP_GRAD_LS
-#else
-  (void)src1;
-  (void)dst1;
-  (void)src2;
-  (void)dst2;
-  (void)d0;
-  (void)d1;
-  (void)bw;
-  (void)bh;
-#endif  // OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-}
-
 #if OPFL_COMBINE_INTERP_GRAD_LS
 static AOM_FORCE_INLINE void compute_pred_using_interp_grad_highbd(
     const uint16_t *src1, const uint16_t *src2, int16_t *dst1, int16_t *dst2,
@@ -1328,8 +1178,7 @@
       (int16_t *)aom_memalign(16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(int16_t));
   av1_copy_pred_array_highbd(dst0, dst1, tmp0, tmp1, bw, bh, d0, d1);
   // Buffers gx0 and gy0 are used to store the gradients of tmp0
-  av1_compute_subpel_gradients_interp(tmp0, bw, bh, &grad_prec_bits, gx0, gy0,
-                                      is_cur_buf_hbd(xd));
+  av1_compute_subpel_gradients_interp(tmp0, bw, bh, &grad_prec_bits, gx0, gy0);
 
   n_blocks = av1_opfl_mv_refinement_nxn_interp_grad(
       tmp1, bw, gx0, gy0, bw, bw, bh, n, d0, d1, grad_prec_bits, target_prec,
@@ -1341,12 +1190,10 @@
   int16_t *tmp =
       (int16_t *)aom_memalign(16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(int16_t));
   av1_copy_pred_array_highbd(dst0, NULL, tmp, NULL, bw, bh, d0, d1);
-  av1_compute_subpel_gradients_interp(tmp, bw, bh, &grad_prec_bits, gx0, gy0,
-                                      is_cur_buf_hbd(xd));
+  av1_compute_subpel_gradients_interp(tmp, bw, bh, &grad_prec_bits, gx0, gy0);
 
   av1_copy_pred_array_highbd(dst1, NULL, tmp, NULL, bw, bh, d0, d1);
-  av1_compute_subpel_gradients_interp(tmp, bw, bh, &grad_prec_bits, gx1, gy1,
-                                      is_cur_buf_hbd(xd));
+  av1_compute_subpel_gradients_interp(tmp, bw, bh, &grad_prec_bits, gx1, gy1);
 
   n_blocks = av1_opfl_mv_refinement_nxn_highbd(
       dst0, bw, dst1, bw, gx0, gy0, gx1, gy1, bw, bw, bh, n, d0, d1,
@@ -1390,118 +1237,6 @@
   return target_prec;
 }
 
-static int get_optflow_based_mv_lowbd(
-    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mbmi,
-    int_mv *mv_refined, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf,
-    CalcSubpelParamsFunc calc_subpel_params_func, int16_t *gx0, int16_t *gy0,
-    int16_t *gx1, int16_t *gy1, int *vx0, int *vy0, int *vx1, int *vy1,
-    uint8_t *dst0, uint8_t *dst1) {
-  const int target_prec = MV_REFINE_PREC_BITS;
-  // Convert output MV to 1/16th pel
-  assert(MV_REFINE_PREC_BITS >= 3);
-  for (int mvi = 0; mvi < N_OF_OFFSETS; mvi++) {
-    mv_refined[mvi * 2].as_mv.row *= 1 << (MV_REFINE_PREC_BITS - 3);
-    mv_refined[mvi * 2].as_mv.col *= 1 << (MV_REFINE_PREC_BITS - 3);
-    mv_refined[mvi * 2 + 1].as_mv.row *= 1 << (MV_REFINE_PREC_BITS - 3);
-    mv_refined[mvi * 2 + 1].as_mv.col *= 1 << (MV_REFINE_PREC_BITS - 3);
-  }
-
-  // Obtain d0 and d1
-  const RefCntBuffer *const r0_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
-  const RefCntBuffer *const r1_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
-  int d0 = get_relative_dist(&cm->seq_params.order_hint_info,
-                             cm->cur_frame->order_hint, r0_buf->order_hint);
-  int d1 = get_relative_dist(&cm->seq_params.order_hint_info,
-                             cm->cur_frame->order_hint, r1_buf->order_hint);
-  if (d0 == 0 || d1 == 0) return target_prec;
-
-  // Obrain P0 and P1
-  InterPredParams params0, params1;
-  av1_opfl_build_inter_predictor(cm, xd, plane, mbmi, bw, bh, mi_x, mi_y,
-                                 mc_buf, &params0, calc_subpel_params_func, 0,
-                                 dst0);
-  av1_opfl_build_inter_predictor(cm, xd, plane, mbmi, bw, bh, mi_x, mi_y,
-                                 mc_buf, &params1, calc_subpel_params_func, 1,
-                                 dst1);
-
-  int n_blocks = 1;
-  int grad_prec_bits;
-  int n = opfl_get_subblock_size(bw, bh, plane);
-
-#if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-  // Compute gradients of P0 and P1 with interpolation
-#if OPFL_COMBINE_INTERP_GRAD_LS
-  (void)gx1;
-  (void)gy1;
-
-  // Compute tmp1 = P0 - P1 and gradients of tmp0 = d0 * P0 - d1 * P1
-  int16_t *tmp0 =
-      (int16_t *)aom_memalign(16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(int16_t));
-  int16_t *tmp1 =
-      (int16_t *)aom_memalign(16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(int16_t));
-  av1_copy_pred_array(dst0, dst1, tmp0, tmp1, bw, bh, d0, d1);
-  // Buffers gx0 and gy0 are used to store the gradients of tmp0
-  av1_compute_subpel_gradients_interp(tmp0, bw, bh, &grad_prec_bits, gx0, gy0,
-                                      is_cur_buf_hbd(xd));
-
-  n_blocks = av1_opfl_mv_refinement_nxn_interp_grad(
-      tmp1, bw, gx0, gy0, bw, bw, bh, n, d0, d1, grad_prec_bits, target_prec,
-      vx0, vy0, vx1, vy1);
-
-  aom_free(tmp0);
-  aom_free(tmp1);
-#else
-  int16_t *tmp =
-      (int16_t *)aom_memalign(16, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(int16_t));
-  av1_copy_pred_array(dst0, NULL, tmp, NULL, bw, bh, d0, d1);
-  av1_compute_subpel_gradients_interp(tmp, bw, bh, &grad_prec_bits, gx0, gy0,
-                                      is_cur_buf_hbd(xd));
-  av1_copy_pred_array(dst1, NULL, tmp, NULL, bw, bh, d0, d1);
-  av1_compute_subpel_gradients_interp(tmp, bw, bh, &grad_prec_bits, gx1, gy1,
-                                      is_cur_buf_hbd(xd));
-
-  n_blocks = av1_opfl_mv_refinement_nxn_lowbd(
-      dst0, bw, dst1, bw, gx0, gy0, gx1, gy1, bw, bw, bh, n, d0, d1,
-      grad_prec_bits, target_prec, vx0, vy0, vx1, vy1);
-
-  aom_free(tmp);
-#endif  // OPFL_COMBINE_INTERP_GRAD_LS
-#else
-  // Compute gradients of P0 and P1 with MC
-  av1_compute_subpel_gradients_mc_lowbd(xd, mbmi, bw, bh, mi_x, mi_y, mc_buf,
-                                        &params0, calc_subpel_params_func, 0,
-                                        &grad_prec_bits, gx0, gy0);
-  av1_compute_subpel_gradients_mc_lowbd(xd, mbmi, bw, bh, mi_x, mi_y, mc_buf,
-                                        &params1, calc_subpel_params_func, 1,
-                                        &grad_prec_bits, gx1, gy1);
-
-  n_blocks = av1_opfl_mv_refinement_nxn_lowbd(
-      dst0, bw, dst1, bw, gx0, gy0, gx1, gy1, bw, bw, bh, n, d0, d1,
-      grad_prec_bits, target_prec, vx0, vy0, vx1, vy1);
-
-#endif  // OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-
-  for (int i = 0; i < n_blocks; i++) {
-#if OPFL_CLAMP_MV_DELTA
-    mv_refined[i * 2].as_mv.row +=
-        clamp(vy0[i], -OPFL_MV_DELTA_LIMIT, OPFL_MV_DELTA_LIMIT);
-    mv_refined[i * 2].as_mv.col +=
-        clamp(vx0[i], -OPFL_MV_DELTA_LIMIT, OPFL_MV_DELTA_LIMIT);
-    mv_refined[i * 2 + 1].as_mv.row +=
-        clamp(vy1[i], -OPFL_MV_DELTA_LIMIT, OPFL_MV_DELTA_LIMIT);
-    mv_refined[i * 2 + 1].as_mv.col +=
-        clamp(vx1[i], -OPFL_MV_DELTA_LIMIT, OPFL_MV_DELTA_LIMIT);
-#else
-    mv_refined[i * 2].as_mv.row += vy0[i];
-    mv_refined[i * 2].as_mv.col += vx0[i];
-    mv_refined[i * 2 + 1].as_mv.row += vy1[i];
-    mv_refined[i * 2 + 1].as_mv.col += vx1[i];
-#endif
-  }
-
-  return target_prec;
-}
-
 // Makes the interpredictor for the region by dividing it up into nxn blocks
 // and running the interpredictor code on each one.
 void make_inter_pred_of_nxn(uint8_t *dst, int dst_stride,
@@ -1579,16 +1314,10 @@
   const int ssx = inter_pred_params->subsampling_x;
   const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
   const int mask_stride = block_size_wide[sb_type];
-  if (inter_pred_params->use_hbd_buf) {
-    aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h, ssx,
-                                  ssy, &inter_pred_params->conv_params,
-                                  inter_pred_params->bit_depth);
-  } else {
-    aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, w, h, ssx, ssy,
-                                 &inter_pred_params->conv_params);
-  }
+  aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, mask_stride, w, h, ssx, ssy,
+                                &inter_pred_params->conv_params,
+                                inter_pred_params->bit_depth);
 }
 
 static void make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
@@ -1602,8 +1331,7 @@
   // a temporary buffer, then will blend that temporary buffer with that from
   // the other reference.
   DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
-  uint8_t *tmp_dst =
-      inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf;
+  uint8_t *tmp_dst = CONVERT_TO_BYTEPTR(tmp_buf);
 
   const int tmp_buf_stride = MAX_SB_SIZE;
   CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst;
@@ -1743,8 +1471,8 @@
       InterPredParams inter_pred_params;
       av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
                             pre_x + x, pd->subsampling_x, pd->subsampling_y,
-                            xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc[0], sf,
-                            &pre_buf, this_mbmi->interp_fltr);
+                            xd->bd, mi->use_intrabc[0], sf, &pre_buf,
+                            this_mbmi->interp_fltr);
       inter_pred_params.conv_params =
           get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
 
@@ -1837,28 +1565,16 @@
     }
     // Refine MV using optical flow. The final output MV will be in 1/16
     // precision.
-    if (is_cur_buf_hbd(xd)) {
-      dst0 = CONVERT_TO_BYTEPTR(
-          aom_calloc(1, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t)));
-      dst1 = CONVERT_TO_BYTEPTR(
-          aom_calloc(1, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t)));
-      get_optflow_based_mv_highbd(
-          cm, xd, plane, mi, mv_refined, bw, bh, mi_x, mi_y, mc_buf,
-          calc_subpel_params_func, gx0, gy0, gx1, gy1, vx0, vy0, vx1, vy1,
-          CONVERT_TO_SHORTPTR(dst0), CONVERT_TO_SHORTPTR(dst1));
-      aom_free(CONVERT_TO_SHORTPTR(dst0));
-      aom_free(CONVERT_TO_SHORTPTR(dst1));
-    } else {
-      dst0 =
-          (uint8_t *)aom_calloc(1, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint8_t));
-      dst1 =
-          (uint8_t *)aom_calloc(1, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint8_t));
-      get_optflow_based_mv_lowbd(cm, xd, plane, mi, mv_refined, bw, bh, mi_x,
-                                 mi_y, mc_buf, calc_subpel_params_func, gx0,
-                                 gy0, gx1, gy1, vx0, vy0, vx1, vy1, dst0, dst1);
-      aom_free(dst0);
-      aom_free(dst1);
-    }
+    dst0 = CONVERT_TO_BYTEPTR(
+        aom_calloc(1, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t)));
+    dst1 = CONVERT_TO_BYTEPTR(
+        aom_calloc(1, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(uint16_t)));
+    get_optflow_based_mv_highbd(
+        cm, xd, plane, mi, mv_refined, bw, bh, mi_x, mi_y, mc_buf,
+        calc_subpel_params_func, gx0, gy0, gx1, gy1, vx0, vy0, vx1, vy1,
+        CONVERT_TO_SHORTPTR(dst0), CONVERT_TO_SHORTPTR(dst1));
+    aom_free(CONVERT_TO_SHORTPTR(dst0));
+    aom_free(CONVERT_TO_SHORTPTR(dst1));
     aom_free(gx0);
     aom_free(gx1);
   }
@@ -1875,8 +1591,7 @@
     InterPredParams inter_pred_params;
     av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
                           pd->subsampling_x, pd->subsampling_y, xd->bd,
-                          is_cur_buf_hbd(xd), mi->use_intrabc[0], sf, pre_buf,
-                          mi->interp_fltr);
+                          mi->use_intrabc[0], sf, pre_buf, mi->interp_fltr);
     if (is_compound) av1_init_comp_mode(&inter_pred_params);
     inter_pred_params.conv_params = get_conv_params_no_round(
         ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
@@ -2087,13 +1802,8 @@
     const int tmp_stride = ctxt->adjacent_stride[plane];
     const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
     const uint8_t *const mask = av1_get_obmc_mask(bh);
-    const int is_hbd = is_cur_buf_hbd(xd);
-    if (is_hbd)
-      aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
-                                 tmp_stride, mask, bw, bh, xd->bd);
-    else
-      aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
-                          mask, bw, bh);
+    aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+                               tmp_stride, mask, bw, bh, xd->bd);
   }
 }
 
@@ -2122,13 +1832,8 @@
     const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
     const uint8_t *const mask = av1_get_obmc_mask(bw);
 
-    const int is_hbd = is_cur_buf_hbd(xd);
-    if (is_hbd)
-      aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
-                                 tmp_stride, mask, bw, bh, xd->bd);
-    else
-      aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
-                          mask, bw, bh);
+    aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+                               tmp_stride, mask, bw, bh, xd->bd);
   }
 }
 
@@ -2158,26 +1863,15 @@
 
 void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
                              uint8_t **dst_buf2) {
-  if (is_cur_buf_hbd(xd)) {
-    int len = sizeof(uint16_t);
-    dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
-    dst_buf1[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
-    dst_buf1[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
-    dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
-    dst_buf2[1] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
-    dst_buf2[2] =
-        CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
-  } else {
-    dst_buf1[0] = xd->tmp_obmc_bufs[0];
-    dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
-    dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
-    dst_buf2[0] = xd->tmp_obmc_bufs[1];
-    dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
-    dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
-  }
+  int len = sizeof(uint16_t);
+  dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+  dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+  dst_buf1[2] =
+      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+  dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+  dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+  dst_buf2[2] =
+      CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
 }
 
 void av1_setup_build_prediction_by_above_pred(
@@ -2260,32 +1954,6 @@
       GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE);
 }
 
-static AOM_INLINE void combine_interintra(
-    INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
-    int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
-    uint8_t *comppred, int compstride, const uint8_t *interpred,
-    int interstride, const uint8_t *intrapred, int intrastride) {
-  const int bw = block_size_wide[plane_bsize];
-  const int bh = block_size_high[plane_bsize];
-
-  if (use_wedge_interintra) {
-    if (av1_is_wedge_used(bsize)) {
-      const uint8_t *mask =
-          av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
-      const int subw = 2 * mi_size_wide[bsize] == bw;
-      const int subh = 2 * mi_size_high[bsize] == bh;
-      aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
-                         interpred, interstride, mask, block_size_wide[bsize],
-                         bw, bh, subw, subh);
-    }
-    return;
-  }
-
-  const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize];
-  aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
-                     interstride, mask, bw, bw, bh, 0, 0);
-}
-
 static AOM_INLINE void combine_interintra_highbd(
     INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
     int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
@@ -2341,20 +2009,11 @@
   const int ssy = xd->plane[plane].subsampling_y;
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
 
-  if (is_cur_buf_hbd(xd)) {
-    combine_interintra_highbd(
-        xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
-        xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
-        plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
-        inter_pred, inter_stride, intra_pred, intra_stride, xd->bd);
-    return;
-  }
-
-  combine_interintra(
+  combine_interintra_highbd(
       xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
       xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
       plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
-      inter_pred, inter_stride, intra_pred, intra_stride);
+      inter_pred, inter_stride, intra_pred, intra_stride, xd->bd);
 }
 
 // build interintra_predictors for one plane
@@ -2363,18 +2022,10 @@
                                     const BUFFER_SET *ctx, int plane,
                                     BLOCK_SIZE bsize) {
   assert(bsize < BLOCK_SIZES_ALL);
-  if (is_cur_buf_hbd(xd)) {
-    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
-    av1_build_intra_predictors_for_interintra(
-        cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
-        MAX_SB_SIZE);
-    av1_combine_interintra(xd, bsize, plane, pred, stride,
-                           CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
-    av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
-                                              intrapredictor, MAX_SB_SIZE);
-    av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
-                           MAX_SB_SIZE);
-  }
+  DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+  av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
+                                            CONVERT_TO_BYTEPTR(intrapredictor),
+                                            MAX_SB_SIZE);
+  av1_combine_interintra(xd, bsize, plane, pred, stride,
+                         CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
 }
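
(Editorial note, not part of the patch.) The hunks above keep uint8_t * in every surviving signature even though the samples are now always 16-bit: libaom carries uint16_t buffers through byte-pointer interfaces via the CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR address-tagging macros, which is also why the OBMC buffer offsets are scaled by sizeof(uint16_t). A minimal sketch of that convention, using hypothetical SKETCH_* names rather than the real macros from aom_dsp:

    #include <stdint.h>

    /* Hypothetical stand-ins for libaom's pointer-tagging macros: a
     * uint16_t buffer is passed through uint8_t* parameters by shifting
     * the address, and shifted back before any sample is touched. */
    #define SKETCH_TO_BYTEPTR(p) ((uint8_t *)(((uintptr_t)(p)) >> 1))
    #define SKETCH_TO_SHORTPTR(p) ((uint16_t *)(((uintptr_t)(p)) << 1))

    static uint16_t read_tagged_sample(const uint8_t *tagged_buf, int i) {
      /* Untag, then index in 16-bit sample units. */
      return SKETCH_TO_SHORTPTR(tagged_buf)[i];
    }
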
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 4e5dfeb..b6ca0e6 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -126,7 +126,6 @@
   int subsampling_y;
   const struct scale_factors *scale_factors;
   int bit_depth;
-  int use_hbd_buf;
   INTERINTER_COMPOUND_DATA mask_comp;
   BLOCK_SIZE sb_type;
   int is_intrabc;
@@ -208,8 +207,7 @@
 void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
                            int block_height, int pix_row, int pix_col,
                            int subsampling_x, int subsampling_y, int bit_depth,
-                           int use_hbd_buf, int is_intrabc,
-                           const struct scale_factors *sf,
+                           int is_intrabc, const struct scale_factors *sf,
                            const struct buf_2d *ref_buf,
                            InterpFilter interp_filter);
 
@@ -290,26 +288,6 @@
   assert(sp->ys <= SUBPEL_SHIFTS);
 }
 
-static INLINE void inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const SubpelParams *subpel_params, int w, int h,
-    ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) {
-  assert(conv_params->do_average == 0 || conv_params->do_average == 1);
-  const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
-  if (is_scaled) {
-    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                           interp_filters, subpel_params->subpel_x,
-                           subpel_params->xs, subpel_params->subpel_y,
-                           subpel_params->ys, 1, conv_params);
-  } else {
-    SubpelParams sp = *subpel_params;
-    revert_scale_extra_bits(&sp);
-    av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
-                           interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
-                           sp.ys, 0, conv_params);
-  }
-}
-
 static INLINE void highbd_inter_predictor(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
     const SubpelParams *subpel_params, int w, int h,
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 6b04ac9..e6c8f22 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -459,12 +459,6 @@
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
 
-static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
-static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
-#if CONFIG_IBP_DC
-static intra_pred_fn ibp_dc_pred[2][2][TX_SIZES_ALL];
-#endif
-
 typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd);
@@ -504,22 +498,6 @@
   p[TX_4X4] = aom_##type##_predictor_4x4; \
   INIT_NO_4X4(p, type)
 
-  INIT_ALL_SIZES(pred[V_PRED], v);
-  INIT_ALL_SIZES(pred[H_PRED], h);
-  INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
-  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
-  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
-  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
-  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
-  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
-  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
-  INIT_ALL_SIZES(dc_pred[1][1], dc);
-#if CONFIG_IBP_DC
-  INIT_ALL_SIZES(ibp_dc_pred[0][0], dc_128);
-  INIT_ALL_SIZES(ibp_dc_pred[0][1], ibp_dc_top);
-  INIT_ALL_SIZES(ibp_dc_pred[1][0], ibp_dc_left);
-  INIT_ALL_SIZES(ibp_dc_pred[1][1], ibp_dc);
-#endif
   INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
   INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
   INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
@@ -810,53 +788,6 @@
   }
 }
 
-static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                         const uint8_t *above, const uint8_t *left,
-                         int upsample_above, int upsample_left, int angle,
-                         int mrl_index) {
-  const int dx = av1_get_dx(angle);
-  const int dy = av1_get_dy(angle);
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-  assert(angle > 0 && angle < 270);
-
-  if (angle > 0 && angle < 90) {
-    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
-                         dy, mrl_index);
-  } else if (angle > 90 && angle < 180) {
-    av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
-                         upsample_left, dx, dy, mrl_index);
-  } else if (angle > 180 && angle < 270) {
-    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
-                         dy, mrl_index);
-  } else if (angle == 90) {
-    pred[V_PRED][tx_size](dst, stride, above, left);
-  } else if (angle == 180) {
-    pred[H_PRED][tx_size](dst, stride, above, left);
-  }
-}
-#if CONFIG_IBP_DIR
-// Generate the second directional predictor for IBP
-static void second_dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                                const uint8_t *above, const uint8_t *left,
-                                int upsample_above, int upsample_left,
-                                int angle) {
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-
-  if (angle > 0 && angle < 90) {
-    int dy = second_dr_intra_derivative[angle];
-    int dx = 1;
-    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
-                         dy, 0);
-  } else if (angle > 180 && angle < 270) {
-    int dx = second_dr_intra_derivative[270 - angle];
-    int dy = 1;
-    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
-                         dy, 0);
-  }
-}
-#endif
 // Directional prediction, zone 1: 0 < angle < 90
 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint16_t *above,
@@ -1246,39 +1177,6 @@
   return strength;
 }
 
-void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
-  if (!strength) return;
-
-  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
-                                                         { 0, 5, 6, 5, 0 },
-                                                         { 2, 4, 4, 4, 2 } };
-  const int filt = strength - 1;
-  uint8_t edge[129];
-
-  memcpy(edge, p, sz * sizeof(*p));
-  for (int i = 1; i < sz; i++) {
-    int s = 0;
-    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
-      int k = i - 2 + j;
-      k = (k < 0) ? 0 : k;
-      k = (k > sz - 1) ? sz - 1 : k;
-      s += edge[k] * kernel[filt][j];
-    }
-    s = (s + 8) >> 4;
-    p[i] = s;
-  }
-}
-
-static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
-  const int kernel[3] = { 5, 6, 5 };
-
-  int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
-          (p_above[0] * kernel[2]);
-  s = (s + 8) >> 4;
-  p_above[-1] = s;
-  p_left[-1] = s;
-}
-
 void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
   if (!strength) return;
 
@@ -1312,29 +1210,6 @@
   p_left[-1] = s;
 }
 
-void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
-  // interpolate half-sample positions
-  assert(sz <= MAX_UPSAMPLE_SZ);
-
-  uint8_t in[MAX_UPSAMPLE_SZ + 3];
-  // copy p[-1..(sz-1)] and extend first and last samples
-  in[0] = p[-1];
-  in[1] = p[-1];
-  for (int i = 0; i < sz; i++) {
-    in[i + 2] = p[i];
-  }
-  in[sz + 2] = p[sz - 1];
-
-  // interpolate half-sample edge positions
-  p[-2] = in[0];
-  for (int i = 0; i < sz; i++) {
-    int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
-    s = clip_pixel((s + 8) >> 4);
-    p[2 * i - 1] = s;
-    p[2 * i] = in[i + 2];
-  }
-}
-
 void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
   // interpolate half-sample positions
   assert(sz <= MAX_UPSAMPLE_SZ);
@@ -1778,346 +1653,6 @@
 #endif
 }
 
-static void build_intra_predictors(
-    const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst,
-    int dst_stride, PREDICTION_MODE mode, int angle_delta,
-    FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size,
-    int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px,
-    int n_bottomleft_px, int plane, int is_sb_boundary
-#if CONFIG_ORIP
-    ,
-    const int seq_intra_pred_filter_flag
-#endif
-#if CONFIG_IBP_DIR || CONFIG_IBP_DC
-    ,
-    const int seq_ibp_flag
-#endif
-#if CONFIG_IBP_DIR
-    ,
-    uint8_t *const ibp_weights[TX_SIZES_ALL][DIR_MODES_0_90]
-#endif
-) {
-  int i;
-  const uint8_t mrl_index =
-      (plane == PLANE_TYPE_Y && is_inter_block(xd->mi[0], xd->tree_type) == 0)
-          ? xd->mi[0]->mrl_index
-          : 0;
-  const int above_mrl_idx = is_sb_boundary ? 0 : mrl_index;
-  const uint8_t *above_ref = ref - ref_stride * (above_mrl_idx + 1);
-  const uint8_t *left_ref = ref - 1 - mrl_index;
-  DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
-#if CONFIG_IBP_DIR
-  DECLARE_ALIGNED(16, uint8_t, second_pred_data[MAX_TX_SQUARE + 32]);
-#endif
-  uint8_t *const above_row = above_data + 32;
-  uint8_t *const left_col = left_data + 32;
-#if CONFIG_IBP_DIR
-  uint8_t *const second_pred = second_pred_data + 16;
-#endif
-  const int txwpx = tx_size_wide[tx_size];
-  const int txhpx = tx_size_high[tx_size];
-  int need_left = extend_modes[mode] & NEED_LEFT;
-  int need_above = extend_modes[mode] & NEED_ABOVE;
-  int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
-  int p_angle = 0;
-  const int is_dr_mode = av1_is_directional_mode(mode);
-  const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
-  // The left_data, above_data buffers must be zeroed to fix some intermittent
-  // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
-  // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to
-  // be the potential reason for this issue.
-  memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
-  memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
-
-  // The default values if ref pixels are not available:
-  // 128 127 127 .. 127 127 127 127 127 127
-  // 129  A   B  ..  Y   Z
-  // 129  C   D  ..  W   X
-  // 129  E   F  ..  U   V
-  // 129  G   H  ..  S   T   T   T   T   T
-  // ..
-
-#if CONFIG_ORIP
-  int apply_sub_block_based_refinement_filter =
-      seq_intra_pred_filter_flag && (mrl_index == 0);
-#endif
-
-  if (is_dr_mode) {
-    p_angle = mode_to_angle_map[mode] + angle_delta;
-    if (p_angle <= 90)
-      need_above = 1, need_left = 0, need_above_left = 1;
-    else if (p_angle < 180)
-      need_above = 1, need_left = 1, need_above_left = 1;
-    else
-      need_above = 0, need_left = 1, need_above_left = 1;
-#if CONFIG_IBP_DIR
-    if (seq_ibp_flag) {
-      need_above = 1, need_left = 1, need_above_left = 1;
-    }
-#endif
-
-#if CONFIG_ORIP && !CONFIG_ORIP_NONDC_DISABLED
-    if (apply_sub_block_based_refinement_filter &&
-        (p_angle == 90 || p_angle == 180)) {
-      need_above = 1;
-      need_left = 1;
-      need_above_left = 1;
-    }
-#endif
-  }
-  if (use_filter_intra) need_left = need_above = need_above_left = 1;
-
-  assert(n_top_px >= 0);
-  assert(n_topright_px >= 0);
-  assert(n_left_px >= 0);
-  assert(n_bottomleft_px >= 0);
-
-  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
-    int val;
-    if (need_left) {
-      val = (n_top_px > 0) ? above_ref[0] : 129;
-    } else {
-      val = (n_left_px > 0) ? left_ref[0] : 127;
-    }
-    for (i = 0; i < txhpx; ++i) {
-      memset(dst, val, txwpx);
-      dst += dst_stride;
-    }
-    return;
-  }
-
-  // NEED_LEFT
-  if (need_left) {
-    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
-    if (use_filter_intra) need_bottom = 0;
-#if CONFIG_IBP_DIR
-    if (is_dr_mode)
-      need_bottom =
-          seq_ibp_flag ? (p_angle < 90) || (p_angle > 180) : p_angle > 180;
-#else
-    if (is_dr_mode) need_bottom = p_angle > 180;
-#endif
-    const int num_left_pixels_needed =
-        txhpx + (need_bottom ? txwpx : 3) + (mrl_index << 1);
-    i = 0;
-    if (n_left_px > 0) {
-      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
-      if (need_bottom && n_bottomleft_px > 0) {
-        assert(i == txhpx);
-        for (; i < txhpx + n_bottomleft_px; i++)
-          left_col[i] = left_ref[i * ref_stride];
-      }
-      if (i < num_left_pixels_needed)
-        memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
-    } else if (n_top_px > 0) {
-      memset(left_col, above_ref[0], num_left_pixels_needed);
-    }
-  }
-
-  // NEED_ABOVE
-  if (need_above) {
-    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
-    if (use_filter_intra) need_right = 0;
-#if CONFIG_IBP_DIR
-    if (is_dr_mode)
-      need_right =
-          seq_ibp_flag ? (p_angle < 90) || (p_angle > 180) : p_angle < 90;
-#else
-    if (is_dr_mode) need_right = p_angle < 90;
-#endif
-    const int num_top_pixels_needed =
-        txwpx + (need_right ? txhpx : 0) + (mrl_index << 1);
-    if (n_top_px > 0) {
-      memcpy(above_row, above_ref, n_top_px);
-      i = n_top_px;
-      if (need_right && n_topright_px > 0) {
-        assert(n_top_px == txwpx);
-        memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
-        i += n_topright_px;
-      }
-      if (i < num_top_pixels_needed)
-        memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
-    } else if (n_left_px > 0) {
-      memset(above_row, left_ref[0], num_top_pixels_needed);
-    }
-  }
-
-  if (need_above_left) {
-    for (i = 1; i <= mrl_index + 1; i++) {
-      if (n_top_px > 0 && n_left_px > 0) {
-        above_row[-i] = above_ref[-i];
-        if (is_sb_boundary)
-          left_col[-i] = left_ref[-ref_stride];
-        else
-          left_col[-i] = left_ref[-i * ref_stride];
-      } else if (n_top_px > 0) {
-        above_row[-i] = left_col[-i] = above_ref[0];
-      } else if (n_left_px > 0) {
-        above_row[-i] = left_col[-i] = left_ref[0];
-      } else {
-        above_row[-i] = left_col[-i] = 128;
-      }
-    }
-  }
-
-  if (use_filter_intra) {
-    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                               filter_intra_mode);
-    return;
-  }
-
-  if (is_dr_mode) {
-    int upsample_above = 0;
-    int upsample_left = 0;
-    if (!disable_edge_filter && mrl_index == 0) {
-#if CONFIG_IBP_DIR
-      int need_right = p_angle < 90;
-      int need_bottom = p_angle > 180;
-      int filt_type_above = get_filt_type(xd, plane);
-      int filt_type_left = filt_type_above;
-      int angle_above = p_angle - 90;
-      int angle_left = p_angle - 180;
-      if (seq_ibp_flag) {
-        need_right |= p_angle > 180;
-        need_bottom |= p_angle < 90;
-        const MB_MODE_INFO *ab =
-            (plane == 0) ? xd->above_mbmi : xd->chroma_above_mbmi;
-        const MB_MODE_INFO *le =
-            (plane == 0) ? xd->left_mbmi : xd->chroma_left_mbmi;
-        filt_type_above = ab ? is_smooth(ab, plane) : 0;
-        filt_type_left = le ? is_smooth(le, plane) : 0;
-        angle_above = p_angle > 180 ? (p_angle - 180 - 90) : angle_above;
-        angle_left = p_angle < 90 ? p_angle : angle_left;
-      }
-#else
-      const int need_right = p_angle < 90;
-      const int need_bottom = p_angle > 180;
-      const int filt_type = get_filt_type(xd, plane);
-#endif
-      if (p_angle != 90 && p_angle != 180) {
-        const int ab_le = need_above_left ? 1 : 0;
-        if (need_above && need_left && (txwpx + txhpx >= 24)) {
-          filter_intra_edge_corner(above_row, left_col);
-        }
-        if (need_above && n_top_px > 0) {
-#if CONFIG_IBP_DIR
-          const int strength = intra_edge_filter_strength(
-              txwpx, txhpx, angle_above, filt_type_above);
-#else
-          const int strength =
-              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
-#endif
-          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
-          av1_filter_intra_edge(above_row - ab_le, n_px, strength);
-        }
-        if (need_left && n_left_px > 0) {
-#if CONFIG_IBP_DIR
-          const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, angle_left, filt_type_left);
-#else
-          const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, filt_type);
-#endif
-          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
-          av1_filter_intra_edge(left_col - ab_le, n_px, strength);
-        }
-      }
-#if CONFIG_IBP_DIR
-      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, angle_above,
-                                                   filt_type_above);
-#else
-      upsample_above =
-          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
-#endif
-      if (need_above && upsample_above) {
-        const int n_px = txwpx + (need_right ? txhpx : 0);
-        av1_upsample_intra_edge(above_row, n_px);
-      }
-#if CONFIG_IBP_DIR
-      upsample_left =
-          av1_use_intra_edge_upsample(txhpx, txwpx, angle_left, filt_type_left);
-#else
-      upsample_left =
-          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
-#endif
-      if (need_left && upsample_left) {
-        const int n_px = txhpx + (need_bottom ? txwpx : 0);
-        av1_upsample_intra_edge(left_col, n_px);
-      }
-    }
-    dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
-                 upsample_left, p_angle, mrl_index);
-#if CONFIG_IBP_DIR
-    if (seq_ibp_flag) {
-      if (mrl_index == 0) {
-        if (p_angle > 0 && p_angle < 90) {
-          int mode_index = angle_to_mode_index[p_angle];
-          uint8_t *weights = ibp_weights[tx_size][mode_index];
-          second_dr_predictor(second_pred, txwpx, tx_size, above_row, left_col,
-                              upsample_above, upsample_left, p_angle);
-          av1_ibp_dr_prediction_z1_c(weights, dst, dst_stride, second_pred,
-                                     txwpx, txwpx, txhpx);
-        }
-        if (p_angle > 180 && p_angle < 270) {
-          int mode_index = angle_to_mode_index[270 - p_angle];
-          int transpose_tsize = transpose_tx_size[tx_size];
-          uint8_t *weights = ibp_weights[transpose_tsize][mode_index];
-          second_dr_predictor(second_pred, txwpx, tx_size, above_row, left_col,
-                              upsample_above, upsample_left, p_angle);
-          av1_ibp_dr_prediction_z3_c(weights, dst, dst_stride, second_pred,
-                                     txwpx, txwpx, txhpx);
-        }
-      }
-    }
-#endif
-
-#if CONFIG_ORIP
-#if !CONFIG_ORIP_NONDC_DISABLED
-    // Apply sub-block based filter for horizontal/vertical intra mode
-    if (apply_sub_block_based_refinement_filter &&
-#if DF_RESTRICT_ORIP
-        av1_allow_orip_dir(p_angle, tx_size)) {
-#else
-        av1_allow_orip_dir(p_angle)) {
-#endif
-      av1_apply_orip_4x4subblock(dst, dst_stride, tx_size, above_row, left_col,
-                                 mode);
-    }
-#endif
-#endif
-    return;
-  }
-
-  // predict
-  if (mode == DC_PRED) {
-    dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
-                                                  left_col);
-#if CONFIG_IBP_DC
-    if (seq_ibp_flag && ((plane == 0) || (xd->mi[0]->uv_mode != UV_CFL_PRED)) &&
-        ((n_left_px > 0) || (n_top_px > 0))) {
-      ibp_dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
-                                                        above_row, left_col);
-    }
-#endif
-  } else {
-    pred[mode][tx_size](dst, dst_stride, above_row, left_col);
-  }
-
-#if CONFIG_ORIP
-  apply_sub_block_based_refinement_filter &=
-#if DF_RESTRICT_ORIP
-      av1_allow_orip_smooth_dc(mode, plane, tx_size);
-#else
-      av1_allow_orip_smooth_dc(mode, plane);
-#endif
-  if (apply_sub_block_based_refinement_filter) {
-    av1_apply_orip_4x4subblock(dst, dst_stride, tx_size, above_row, left_col,
-                               mode);
-  }
-#endif
-}
-
 static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
                                             int subsampling_y) {
   assert(subsampling_x >= 0 && subsampling_x < 2);
@@ -2186,19 +1721,10 @@
                                xd->color_index_map_offset[plane != 0];
     const uint16_t *const palette =
         mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
-    if (is_cur_buf_hbd(xd)) {
-      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-      for (r = 0; r < txhpx; ++r) {
-        for (c = 0; c < txwpx; ++c) {
-          dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
-        }
-      }
-    } else {
-      for (r = 0; r < txhpx; ++r) {
-        for (c = 0; c < txwpx; ++c) {
-          dst[r * dst_stride + c] =
-              (uint8_t)palette[map[(r + y) * wpx + c + x]];
-        }
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+    for (r = 0; r < txhpx; ++r) {
+      for (c = 0; c < txwpx; ++c) {
+        dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
       }
     }
     return;
@@ -2246,31 +1772,7 @@
   const int is_sb_boundary =
       (mi_row % cm->seq_params.mib_size == 0 && row_off == 0) ? 1 : 0;
 
-  if (is_cur_buf_hbd(xd)) {
-    build_intra_predictors_high(
-        xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
-        filter_intra_mode, tx_size, disable_edge_filter,
-        have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-        have_top_right ? AOMMIN(txwpx, xr) : 0,
-        have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-        have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane, is_sb_boundary
-#if CONFIG_ORIP
-        ,
-        cm->seq_params.enable_orip
-#endif
-#if CONFIG_IBP_DIR || CONFIG_IBP_DC
-        ,
-        cm->seq_params.enable_ibp
-#endif
-#if CONFIG_IBP_DIR
-        ,
-        cm->ibp_directional_weights
-#endif
-    );
-    return;
-  }
-
-  build_intra_predictors(
+  build_intra_predictors_high(
       xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
       filter_intra_mode, tx_size, disable_edge_filter,
       have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
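
(Editorial note, not part of the patch.) The reconintra.c changes follow the shape repeated throughout this MR: the per-call is_cur_buf_hbd() / use_highbitdepth test disappears and the high-bitdepth leg becomes the only path, so the 8-bit predictor tables, dr_predictor(), the 8-bit edge filters and build_intra_predictors() can all be deleted. Schematically, with made-up names (not libaom functions):

    /* Illustrative sketch of the dispatch collapse only. */
    typedef struct { int bd; } Ctx;

    static void predict_highbd(Ctx *ctx) { (void)ctx; /* 16-bit path, kept */ }
    static void predict_lowbd(Ctx *ctx)  { (void)ctx; /* 8-bit path, removed */ }

    /* Before this MR: runtime dispatch on the buffer depth. */
    static void predict_before(Ctx *ctx, int is_hbd) {
      if (is_hbd) predict_highbd(ctx);
      else predict_lowbd(ctx);
    }

    /* After this MR: the high-bitdepth leg is the only path. */
    static void predict_after(Ctx *ctx) { predict_highbd(ctx); }
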
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 2c3def8..24b6cd5 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -701,65 +701,6 @@
   aom_free(arrbuf2);
 }
 
-static void upscale_normative_rect(const uint8_t *const input, int height,
-                                   int width, int in_stride, uint8_t *output,
-                                   int height2, int width2, int out_stride,
-                                   int x_step_qn, int x0_qn, int pad_left,
-                                   int pad_right) {
-  assert(width > 0);
-  assert(height > 0);
-  assert(width2 > 0);
-  assert(height2 > 0);
-  assert(height2 == height);
-
-  // Extend the left/right pixels of the tile column if needed
-  // (either because we can't sample from other tiles, or because we're at
-  // a frame edge).
-  // Save the overwritten pixels into tmp_left and tmp_right.
-  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
-  // column of border pixels compared to what we'd naively think.
-  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
-  uint8_t *tmp_left =
-      NULL;  // Silence spurious "may be used uninitialized" warnings
-  uint8_t *tmp_right = NULL;
-  uint8_t *const in_tl = (uint8_t *)(input - border_cols);  // Cast off 'const'
-  uint8_t *const in_tr = (uint8_t *)(input + width);
-  if (pad_left) {
-    tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
-    for (int i = 0; i < height; i++) {
-      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
-      memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
-    }
-  }
-  if (pad_right) {
-    tmp_right =
-        (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
-    for (int i = 0; i < height; i++) {
-      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
-      memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
-             border_cols);
-    }
-  }
-
-  av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
-                        height2, &av1_resize_filter_normative[0][0], x0_qn,
-                        x_step_qn);
-
-  // Restore the left/right border pixels
-  if (pad_left) {
-    for (int i = 0; i < height; i++) {
-      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
-    }
-    aom_free(tmp_left);
-  }
-  if (pad_right) {
-    for (int i = 0; i < height; i++) {
-      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
-    }
-    aom_free(tmp_right);
-  }
-}
-
 static void highbd_interpolate_core(const uint16_t *const input, int in_length,
                                     uint16_t *output, int out_length, int bd,
                                     const int16_t *interp_filters,
@@ -1212,14 +1153,15 @@
       const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
       for (int x = 0; x < dst_w; x += 16) {
         const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
-        const uint8_t *src_ptr = srcs[i] +
-                                 (y / factor) * src_h / dst_h * src_stride +
-                                 (x / factor) * src_w / dst_w;
-        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+        const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(srcs[i]);
+        uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dsts[i]);
 
-        aom_convolve8_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
-                        x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
-                        16 * src_h / dst_h, 16 / factor, 16 / factor);
+        aom_highbd_convolve8_c(
+            src_ptr + (y / factor) * src_h / dst_h * src_stride +
+                (x / factor) * src_w / dst_w,
+            src_stride, dst_ptr + (y / factor) * dst_stride + (x / factor),
+            dst_stride, kernel, x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+            16 * src_h / dst_h, 16 / factor, 16 / factor, 8);
       }
     }
   }
@@ -1234,16 +1176,10 @@
   // the static analysis warnings.
   for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
     const int is_uv = i > 0;
-    if (src->flags & YV12_FLAG_HIGHBITDEPTH)
-      av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
-                              src->crop_widths[is_uv], src->strides[is_uv],
-                              dst->buffers[i], dst->crop_heights[is_uv],
-                              dst->crop_widths[is_uv], dst->strides[is_uv], bd);
-    else
-      av1_resize_plane(src->buffers[i], src->crop_heights[is_uv],
-                       src->crop_widths[is_uv], src->strides[is_uv],
-                       dst->buffers[i], dst->crop_heights[is_uv],
-                       dst->crop_widths[is_uv], dst->strides[is_uv]);
+    av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+                            src->crop_widths[is_uv], src->strides[is_uv],
+                            dst->buffers[i], dst->crop_heights[is_uv],
+                            dst->crop_widths[is_uv], dst->strides[is_uv], bd);
   }
   aom_extend_frame_borders(dst, num_planes);
 }
@@ -1293,15 +1229,10 @@
     const int pad_left = (j == 0);
     const int pad_right = (j == cm->tiles.cols - 1);
 
-    if (cm->seq_params.use_highbitdepth)
-      highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
-                                    dst_ptr, rows, dst_width, dst_stride,
-                                    x_step_qn, x0_qn, pad_left, pad_right,
-                                    cm->seq_params.bit_depth);
-    else
-      upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
-                             rows, dst_width, dst_stride, x_step_qn, x0_qn,
-                             pad_left, pad_right);
+    highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
+                                  rows, dst_width, dst_stride, x_step_qn, x0_qn,
+                                  pad_left, pad_right,
+                                  cm->seq_params.bit_depth);
 
     // Update the fractional pixel offset to prepare for the next tile column.
     x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
@@ -1418,8 +1349,7 @@
   const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
   if (aom_alloc_frame_buffer(
           &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_BORDER_IN_PIXELS, byte_alignment))
+          seq_params->subsampling_y, AOM_BORDER_IN_PIXELS, byte_alignment))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate copy buffer for superres upscaling");
 
@@ -1450,8 +1380,8 @@
     if (aom_realloc_frame_buffer(
             frame_to_show, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,
-            seq_params->subsampling_y, seq_params->use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) {
+            seq_params->subsampling_y, AOM_BORDER_IN_PIXELS, byte_alignment, fb,
+            cb, cb_priv)) {
       unlock_buffer_pool(pool);
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
@@ -1467,8 +1397,7 @@
     if (aom_alloc_frame_buffer(
             frame_to_show, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,
-            seq_params->subsampling_y, seq_params->use_highbitdepth,
-            AOM_BORDER_IN_PIXELS, byte_alignment))
+            seq_params->subsampling_y, AOM_BORDER_IN_PIXELS, byte_alignment))
       aom_internal_error(
           &cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate current frame buffer for superres upscaling");
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index e66f9c6..822ed1e 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -135,25 +135,6 @@
 #endif
 }
 
-static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
-                               int border_horz, int border_vert) {
-  uint8_t *data_p;
-  int i;
-  for (i = 0; i < height; ++i) {
-    data_p = data + i * stride;
-    memset(data_p - border_horz, data_p[0], border_horz);
-    memset(data_p + width, data_p[width - 1], border_horz);
-  }
-  data_p = data - border_horz;
-  for (i = -border_vert; i < 0; ++i) {
-    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
-  }
-  for (i = height; i < height + border_vert; ++i) {
-    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
-           width + 2 * border_horz);
-  }
-}
-
 static void extend_frame_highbd(uint16_t *data, int width, int height,
                                 int stride, int border_horz, int border_vert) {
   uint16_t *data_p;
@@ -181,34 +162,18 @@
 }
 
 void av1_extend_frame(uint8_t *data, int width, int height, int stride,
-                      int border_horz, int border_vert, int highbd) {
-  if (highbd) {
-    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
-                        border_horz, border_vert);
-    return;
-  }
-  (void)highbd;
-  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
-}
-
-static void copy_tile_lowbd(int width, int height, const uint8_t *src,
-                            int src_stride, uint8_t *dst, int dst_stride) {
-  for (int i = 0; i < height; ++i)
-    memcpy(dst + i * dst_stride, src + i * src_stride, width);
+                      int border_horz, int border_vert) {
+  extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
+                      border_horz, border_vert);
 }
 
 static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride, int highbd) {
-  if (highbd) {
-    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
-                     CONVERT_TO_SHORTPTR(dst), dst_stride);
-    return;
-  }
-  (void)highbd;
-  copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+                      uint8_t *dst, int dst_stride) {
+  copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+                   CONVERT_TO_SHORTPTR(dst), dst_stride);
 }
 
-#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
+#define REAL_PTR(d) ((uint8_t *)CONVERT_TO_SHORTPTR(d))
 
 // With striped loop restoration, the filtering for each 64-pixel stripe gets
 // most of its input from the output of CDEF (stored in data8), but we need to
@@ -277,7 +242,7 @@
 // index we get from limits into something we can look up in rsb).
 static void setup_processing_stripe_boundary(
     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
-    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
+    int rsb_row, int h, uint8_t *data8, int data_stride,
     RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
   // Offsets within the line buffers. The buffer logically starts at column
   // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
@@ -286,7 +251,7 @@
   const int buf_x0_off = limits->h_start;
   const int line_width =
       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
-  const int line_size = line_width << use_highbd;
+  const int line_size = line_width << 1;
 
   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
 
@@ -308,13 +273,12 @@
       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
         const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
         const int buf_off = buf_x0_off + buf_row * buf_stride;
-        const uint8_t *buf =
-            rsb->stripe_boundary_above + (buf_off << use_highbd);
+        const uint8_t *buf = rsb->stripe_boundary_above + (buf_off << 1);
         uint8_t *dst8 = data8_tl + i * data_stride;
         // Save old pixels, then replace with data from stripe_boundary_above
-        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
-               REAL_PTR(use_highbd, dst8), line_size);
-        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], REAL_PTR(dst8),
+               line_size);
+        memcpy(REAL_PTR(dst8), buf, line_size);
       }
     }
 
@@ -328,13 +292,12 @@
       for (int i = 0; i < RESTORATION_BORDER; ++i) {
         const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
         const int buf_off = buf_x0_off + buf_row * buf_stride;
-        const uint8_t *src =
-            rsb->stripe_boundary_below + (buf_off << use_highbd);
+        const uint8_t *src = rsb->stripe_boundary_below + (buf_off << 1);
 
         uint8_t *dst8 = data8_bl + i * data_stride;
         // Save old pixels, then replace with data from stripe_boundary_below
-        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
-        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+        memcpy(rlbs->tmp_save_below[i], REAL_PTR(dst8), line_size);
+        memcpy(REAL_PTR(dst8), src, line_size);
       }
     }
   } else {
@@ -344,10 +307,9 @@
       // Only save and overwrite i=-RESTORATION_BORDER line.
       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
       // Save old pixels, then replace with data from stripe_boundary_above
-      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
-      memcpy(REAL_PTR(use_highbd, dst8),
-             REAL_PTR(use_highbd,
-                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
+      memcpy(rlbs->tmp_save_above[0], REAL_PTR(dst8), line_size);
+      memcpy(REAL_PTR(dst8),
+             REAL_PTR(data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
              line_size);
     }
 
@@ -358,9 +320,9 @@
       // Only save and overwrite i=2 line.
       uint8_t *dst8 = data8_bl + 2 * data_stride;
       // Save old pixels, then replace with data from stripe_boundary_below
-      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
-      memcpy(REAL_PTR(use_highbd, dst8),
-             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
+      memcpy(rlbs->tmp_save_below[2], REAL_PTR(dst8), line_size);
+      memcpy(REAL_PTR(dst8), REAL_PTR(data8_bl + (2 - 1) * data_stride),
+             line_size);
     }
   }
 }
@@ -380,11 +342,11 @@
 // by the top/bottom borders.
 static void restore_processing_stripe_boundary(
     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
-    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
-    int copy_below, int opt) {
+    int h, uint8_t *data8, int data_stride, int copy_above, int copy_below,
+    int opt) {
   const int line_width =
       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
-  const int line_size = line_width << use_highbd;
+  const int line_size = line_width << 1;
 
   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
 
@@ -393,8 +355,8 @@
       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
         uint8_t *dst8 = data8_tl + i * data_stride;
-        memcpy(REAL_PTR(use_highbd, dst8),
-               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+        memcpy(REAL_PTR(dst8), rlbs->tmp_save_above[i + RESTORATION_BORDER],
+               line_size);
       }
     }
 
@@ -406,7 +368,7 @@
         if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
 
         uint8_t *dst8 = data8_bl + i * data_stride;
-        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+        memcpy(REAL_PTR(dst8), rlbs->tmp_save_below[i], line_size);
       }
     }
   } else {
@@ -415,7 +377,7 @@
 
       // Only restore i=-RESTORATION_BORDER line.
       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
-      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+      memcpy(REAL_PTR(dst8), rlbs->tmp_save_above[0], line_size);
     }
 
     if (copy_below) {
@@ -425,32 +387,12 @@
       // Only restore i=2 line.
       if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
         uint8_t *dst8 = data8_bl + 2 * data_stride;
-        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+        memcpy(REAL_PTR(dst8), rlbs->tmp_save_below[2], line_size);
       }
     }
   }
 }
 
-static void wiener_filter_stripe(const RestorationUnitInfo *rui,
-                                 int stripe_width, int stripe_height,
-                                 int procunit_width, const uint8_t *src,
-                                 int src_stride, uint8_t *dst, int dst_stride,
-                                 int32_t *tmpbuf, int bit_depth) {
-  (void)tmpbuf;
-  (void)bit_depth;
-  assert(bit_depth == 8);
-  const ConvolveParams conv_params = get_conv_params_wiener(8);
-
-  for (int j = 0; j < stripe_width; j += procunit_width) {
-    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
-    const uint8_t *src_p = src + j;
-    uint8_t *dst_p = dst + j;
-    av1_wiener_convolve_add_src(
-        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
-        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
-  }
-}
-
 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
    over the input. The window is of size (2r + 1)x(2r + 1), and we
    specialize to r = 1, 2, 3. A default function is used for r > 3.
@@ -866,24 +808,16 @@
 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
                                  int flt_stride, int sgr_params_idx,
-                                 int bit_depth, int highbd) {
+                                 int bit_depth) {
   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
   int32_t *dgd32 =
       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
 
-  if (highbd) {
-    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
-    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
-      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
-        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
-      }
-    }
-  } else {
-    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
-      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
-        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
-      }
+  const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
+  for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+    for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+      dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
     }
   }
 
@@ -907,13 +841,13 @@
                                         int height, int stride, int eps,
                                         const int *xqd, uint8_t *dst8,
                                         int dst_stride, int32_t *tmpbuf,
-                                        int bit_depth, int highbd) {
+                                        int bit_depth) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
 
   const int ret = av1_selfguided_restoration_c(
-      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth);
   (void)ret;
   assert(!ret);
   const sgr_params_type *const params = &av1_sgr_params[eps];
@@ -925,7 +859,7 @@
       uint8_t *dst8ij = dst8 + i * dst_stride + j;
       const uint8_t *dat8ij = dat8 + i * stride + j;
 
-      const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
+      const uint16_t pre_u = *CONVERT_TO_SHORTPTR(dat8ij);
       const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
       int32_t v = u << SGRPROJ_PRJ_BITS;
       // If params->r == 0 then we skipped the filtering in
@@ -936,30 +870,11 @@
           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
 
       const uint16_t out = clip_pixel_highbd(w, bit_depth);
-      if (highbd)
-        *CONVERT_TO_SHORTPTR(dst8ij) = out;
-      else
-        *dst8ij = (uint8_t)out;
+      *CONVERT_TO_SHORTPTR(dst8ij) = out;
     }
   }
 }
 
-static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
-                                  int stripe_width, int stripe_height,
-                                  int procunit_width, const uint8_t *src,
-                                  int src_stride, uint8_t *dst, int dst_stride,
-                                  int32_t *tmpbuf, int bit_depth) {
-  (void)bit_depth;
-  assert(bit_depth == 8);
-
-  for (int j = 0; j < stripe_width; j += procunit_width) {
-    int w = AOMMIN(procunit_width, stripe_width - j);
-    av1_apply_selfguided_restoration(
-        src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
-        rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0);
-  }
-}
-
 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
                                         int stripe_width, int stripe_height,
                                         int procunit_width, const uint8_t *src8,
@@ -990,7 +905,7 @@
     int w = AOMMIN(procunit_width, stripe_width - j);
     av1_apply_selfguided_restoration(
         src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
-        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth);
   }
 }
 
@@ -1000,10 +915,9 @@
                                   int src_stride, uint8_t *dst, int dst_stride,
                                   int32_t *tmpbuf, int bit_depth);
 
-#define NUM_STRIPE_FILTERS 4
+#define NUM_STRIPE_FILTERS 2
 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
-  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
-  sgrproj_filter_stripe_highbd
+  wiener_filter_stripe_highbd, sgrproj_filter_stripe_highbd
 };
 
 // Filter one restoration unit
@@ -1011,8 +925,8 @@
     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
-    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
-    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+    int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, int dst_stride,
+    int32_t *tmpbuf, int optimized_lr) {
   RestorationType unit_rtype = rui->restoration_type;
 
   int unit_h = limits->v_end - limits->v_start;
@@ -1021,11 +935,11 @@
   uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
 
   if (unit_rtype == RESTORE_NONE) {
-    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride);
     return;
   }
 
-  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
+  const int filter_idx = (unit_rtype == RESTORE_SGRPROJ);
   assert(filter_idx < NUM_STRIPE_FILTERS);
   const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
 
@@ -1060,15 +974,15 @@
     const int h = AOMMIN(nominal_stripe_height,
                          remaining_stripes.v_end - remaining_stripes.v_start);
 
-    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
-                                     h, data8, stride, rlbs, copy_above,
-                                     copy_below, optimized_lr);
+    setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, h, data8,
+                                     stride, rlbs, copy_above, copy_below,
+                                     optimized_lr);
 
     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
 
-    restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
-                                       data8, stride, copy_above, copy_below,
+    restore_processing_stripe_boundary(&remaining_stripes, rlbs, h, data8,
+                                       stride, copy_above, copy_below,
                                        optimized_lr);
 
     i += h;
@@ -1084,8 +998,8 @@
 
   av1_loop_restoration_filter_unit(
       limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
-      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
-      ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
+      ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->bit_depth, ctxt->data8,
+      ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
       rsi->optimized_lr);
 }
 
@@ -1095,14 +1009,13 @@
                                             int num_planes) {
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int bit_depth = seq_params->bit_depth;
-  const int highbd = seq_params->use_highbitdepth;
   lr_ctxt->dst = &cm->rst_frame;
 
   const int frame_width = frame->crop_widths[0];
   const int frame_height = frame->crop_heights[0];
   if (aom_realloc_frame_buffer(
           lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
-          seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+          seq_params->subsampling_y, AOM_RESTORATION_FRAME_BORDER,
           cm->features.byte_alignment, NULL, NULL, NULL) < 0)
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate restoration dst buffer");
@@ -1125,12 +1038,11 @@
 
     av1_extend_frame(frame->buffers[plane], plane_width, plane_height,
                      frame->strides[is_uv], RESTORATION_BORDER,
-                     RESTORATION_BORDER, highbd);
+                     RESTORATION_BORDER);
 
     lr_plane_ctxt->rsi = rsi;
     lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
     lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
-    lr_plane_ctxt->highbd = highbd;
     lr_plane_ctxt->bit_depth = bit_depth;
     lr_plane_ctxt->data8 = frame->buffers[plane];
     lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
@@ -1372,33 +1284,27 @@
 
 // Extend to left and right
 static void extend_lines(uint8_t *buf, int width, int height, int stride,
-                         int extend, int use_highbitdepth) {
+                         int extend) {
   for (int i = 0; i < height; ++i) {
-    if (use_highbitdepth) {
-      uint16_t *buf16 = (uint16_t *)buf;
-      aom_memset16(buf16 - extend, buf16[0], extend);
-      aom_memset16(buf16 + width, buf16[width - 1], extend);
-    } else {
-      memset(buf - extend, buf[0], extend);
-      memset(buf + width, buf[width - 1], extend);
-    }
+    uint16_t *buf16 = (uint16_t *)buf;
+    aom_memset16(buf16 - extend, buf16[0], extend);
+    aom_memset16(buf16 + width, buf16[width - 1], extend);
     buf += stride;
   }
 }
 
 static void save_deblock_boundary_lines(
     const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
-    int stripe, int use_highbd, int is_above,
-    RestorationStripeBoundaries *boundaries) {
+    int stripe, int is_above, RestorationStripeBoundaries *boundaries) {
   const int is_uv = plane > 0;
-  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
-  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *src_buf = REAL_PTR(frame->buffers[plane]);
+  const int src_stride = frame->strides[is_uv] << 1;
   const uint8_t *src_rows = src_buf + row * src_stride;
 
   uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
                                : boundaries->stripe_boundary_below;
-  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
-  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << 1);
+  const int bdry_stride = boundaries->stripe_boundary_stride << 1;
   uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
 
   // There is a rare case in which a processing stripe can end 1px above the
@@ -1415,19 +1321,14 @@
   if (av1_superres_scaled(cm)) {
     const int ss_x = is_uv && cm->seq_params.subsampling_x;
     upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
-    line_bytes = upscaled_width << use_highbd;
-    if (use_highbd)
-      av1_upscale_normative_rows(
-          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
-          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
-          plane, lines_to_save);
-    else
-      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
-                                 boundaries->stripe_boundary_stride, plane,
-                                 lines_to_save);
+    line_bytes = upscaled_width << 1;
+    av1_upscale_normative_rows(
+        cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
+        CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
+        plane, lines_to_save);
   } else {
     upscaled_width = frame->crop_widths[is_uv];
-    line_bytes = upscaled_width << use_highbd;
+    line_bytes = upscaled_width << 1;
     for (int i = 0; i < lines_to_save; i++) {
       memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
              line_bytes);
@@ -1438,22 +1339,22 @@
     memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
 
   extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
-               RESTORATION_EXTRA_HORZ, use_highbd);
+               RESTORATION_EXTRA_HORZ);
 }
 
 static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                      const AV1_COMMON *cm, int plane, int row,
-                                     int stripe, int use_highbd, int is_above,
+                                     int stripe, int is_above,
                                      RestorationStripeBoundaries *boundaries) {
   const int is_uv = plane > 0;
-  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
-  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *src_buf = REAL_PTR(frame->buffers[plane]);
+  const int src_stride = frame->strides[is_uv] << 1;
   const uint8_t *src_rows = src_buf + row * src_stride;
 
   uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
                                : boundaries->stripe_boundary_below;
-  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
-  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << 1);
+  const int bdry_stride = boundaries->stripe_boundary_stride << 1;
   uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
   const int src_width = frame->crop_widths[is_uv];
 
@@ -1464,7 +1365,7 @@
   const int upscaled_width = av1_superres_scaled(cm)
                                  ? (cm->superres_upscaled_width + ss_x) >> ss_x
                                  : src_width;
-  const int line_bytes = upscaled_width << use_highbd;
+  const int line_bytes = upscaled_width << 1;
   for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
     // Copy the line at 'row' into both context lines. This is because
     // we want to (effectively) extend the outermost row of CDEF data
@@ -1473,12 +1374,12 @@
     memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
   }
   extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
-               RESTORATION_EXTRA_HORZ, use_highbd);
+               RESTORATION_EXTRA_HORZ);
 }
 
 static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
-                                         int use_highbd, int plane,
-                                         AV1_COMMON *cm, int after_cdef) {
+                                         int plane, AV1_COMMON *cm,
+                                         int after_cdef) {
   const int is_uv = plane > 0;
   const int ss_y = is_uv && cm->seq_params.subsampling_y;
   const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
@@ -1514,11 +1415,11 @@
       // Save deblocked context where needed.
       if (use_deblock_above) {
         save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
-                                    frame_stripe, use_highbd, 1, boundaries);
+                                    frame_stripe, 1, boundaries);
       }
       if (use_deblock_below) {
-        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
-                                    use_highbd, 0, boundaries);
+        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe, 0,
+                                    boundaries);
       }
     } else {
       // Save CDEF context where needed. Note that we need to save the CDEF
@@ -1528,12 +1429,12 @@
       // In addition, we need to save copies of the outermost line within
       // the tile, rather than using data from outside the tile.
       if (!use_deblock_above) {
-        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
-                                 1, boundaries);
+        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, 1,
+                                 boundaries);
       }
       if (!use_deblock_below) {
-        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
-                                 use_highbd, 0, boundaries);
+        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe, 0,
+                                 boundaries);
       }
     }
   }
@@ -1545,8 +1446,7 @@
 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                               AV1_COMMON *cm, int after_cdef) {
   const int num_planes = av1_num_planes(cm);
-  const int use_highbd = cm->seq_params.use_highbitdepth;
   for (int p = 0; p < num_planes; ++p) {
-    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+    save_tile_row_boundary_lines(frame, p, cm, after_cdef);
   }
 }
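
Note on the restoration changes above: with the 8-bit path gone, every frame and boundary buffer holds 16-bit samples, so each byte stride is simply the pixel stride shifted left by one (the "<< 1" that replaces "<< use_highbd"). A minimal standalone sketch of the row copy this reduces to; the helper name is illustrative, the library code just calls memcpy directly as in the hunk above:

    #include <stdint.h>
    #include <string.h>

    /* Copy `lines` rows of `width` 16-bit pixels between two buffers whose
       strides are given in pixels, mirroring the memcpy(..., line_bytes) calls
       above, where line_bytes = upscaled_width << 1. */
    static void copy_boundary_rows_16bit(uint8_t *dst, int dst_stride_px,
                                         const uint8_t *src, int src_stride_px,
                                         int width, int lines) {
      const size_t line_bytes = (size_t)width << 1; /* 2 bytes per sample */
      for (int i = 0; i < lines; ++i) {
        memcpy(dst + (size_t)i * ((size_t)dst_stride_px << 1),
               src + (size_t)i * ((size_t)src_stride_px << 1), line_bytes);
      }
    }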
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 198ea7f..4ce81dc 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -341,7 +341,7 @@
   const RestorationInfo *rsi;
   int tile_stripe0;
   int ss_x, ss_y;
-  int highbd, bit_depth;
+  int bit_depth;
   uint8_t *data8, *dst8;
   int data_stride, dst_stride;
   AV1PixelRect tile_rect;
@@ -364,7 +364,7 @@
 void av1_free_restoration_struct(RestorationInfo *rst_info);
 
 void av1_extend_frame(uint8_t *data, int width, int height, int stride,
-                      int border_horz, int border_vert, int highbd);
+                      int border_horz, int border_vert);
 void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
 
 /*!\endcond */
@@ -384,7 +384,6 @@
  * \param[in]  tile_stripe0  Index of the first stripe in this tile
  * \param[in]  ss_x          Horizontal subsampling for plane
  * \param[in]  ss_y          Vertical subsampling for plane
- * \param[in]  highbd        Whether high bitdepth pipeline is used
  * \param[in]  bit_depth     Bit-depth of the video
  * \param[in]  data8         Frame data (pointing at the top-left corner of
  *                           the frame, not the restoration unit).
@@ -404,8 +403,8 @@
     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
-    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
-    int dst_stride, int32_t *tmpbuf, int optimized_lr);
+    int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, int dst_stride,
+    int32_t *tmpbuf, int optimized_lr);
 
 /*!\brief Function for applying loop restoration filter to a frame
  *
diff --git a/av1/common/tip.c b/av1/common/tip.c
index 49702be..d3d2138 100644
--- a/av1/common/tip.c
+++ b/av1/common/tip.c
@@ -608,7 +608,6 @@
   uint8_t *const dst = dst_buf->buf;
 
   const int bd = cm->seq_params.bit_depth;
-  const int high_bd = cm->seq_params.use_highbitdepth;
 
   const int ss_x = plane ? cm->seq_params.subsampling_x : 0;
   const int ss_y = plane ? cm->seq_params.subsampling_y : 0;
@@ -622,8 +621,8 @@
 
     InterPredParams inter_pred_params;
     av1_init_inter_params(&inter_pred_params, comp_bw, comp_bh, comp_pixel_y,
-                          comp_pixel_x, ss_x, ss_y, bd, high_bd, 0, sf,
-                          pred_buf, MULTITAP_SHARP);
+                          comp_pixel_x, ss_x, ss_y, bd, 0, sf, pred_buf,
+                          MULTITAP_SHARP);
 
     inter_pred_params.comp_mode = UNIFORM_COMP;
 
@@ -848,59 +847,6 @@
   }
 }
 
-static void tip_extend_plane_block_based(uint8_t *const src, int src_stride,
-                                         int width, int height, int extend_top,
-                                         int extend_left, int extend_bottom,
-                                         int extend_right, int start_w,
-                                         int start_h, int blk_w, int blk_h) {
-  assert(src != NULL);
-  int i = 0;
-
-  if (extend_left) {
-    // copy the left most columns out
-    uint8_t *src_ptr = src + start_h * src_stride;
-    uint8_t *dst_ptr = src_ptr - extend_left;
-    for (i = 0; i < blk_h; ++i) {
-      memset(dst_ptr, src_ptr[0], extend_left);
-      src_ptr += src_stride;
-      dst_ptr += src_stride;
-    }
-  }
-
-  if (extend_right) {
-    // copy the right most columns out
-    uint8_t *src_ptr = src + start_h * src_stride + width - 1;
-    uint8_t *dst_ptr = src_ptr + 1;
-    for (i = 0; i < blk_h; ++i) {
-      memset(dst_ptr, src_ptr[0], extend_right);
-      src_ptr += src_stride;
-      dst_ptr += src_stride;
-    }
-  }
-
-  if (extend_top) {
-    // copy the top lines into each line of the respective borders
-    uint8_t *src_ptr = src + start_w - extend_left;
-    uint8_t *dst_ptr = src_ptr - src_stride * extend_top;
-    const int extend_size = extend_left + extend_right + blk_w;
-    for (i = 0; i < extend_top; ++i) {
-      memcpy(dst_ptr, src_ptr, extend_size);
-      dst_ptr += src_stride;
-    }
-  }
-
-  if (extend_bottom) {
-    // copy the bottom lines into each line of the respective borders
-    uint8_t *src_ptr = src + src_stride * (height - 1) + start_w - extend_left;
-    uint8_t *dst_ptr = src_ptr + src_stride;
-    const int extend_size = extend_left + extend_right + blk_w;
-    for (i = 0; i < extend_bottom; ++i) {
-      memcpy(dst_ptr, src_ptr, extend_size);
-      dst_ptr += src_stride;
-    }
-  }
-}
-
 static void tip_extend_plane_border(AV1_COMMON *cm, int blk_row_start,
                                     int blk_col_start, int blk_height,
                                     int blk_width) {
@@ -931,7 +877,6 @@
   }
 
   if (top_border || bottom_border || left_border || right_border) {
-    const int is_high_bitdepth = tip_buf->flags & YV12_FLAG_HIGHBITDEPTH;
     const int subsampling_x = cm->seq_params.subsampling_x;
     const int subsampling_y = cm->seq_params.subsampling_y;
     const int y_stride = tip_buf->y_stride;
@@ -955,43 +900,23 @@
     const int uv_extend_left = extend_left >> subsampling_x;
     const int uv_extend_right = extend_right >> subsampling_x;
 
-    if (is_high_bitdepth) {
-      tip_extend_plane_block_based_highbd(
-          y_dst, y_stride, y_width, y_height, extend_top, extend_left,
-          extend_bottom, extend_right, blk_col_start, blk_row_start, blk_width,
-          blk_height);
+    tip_extend_plane_block_based_highbd(y_dst, y_stride, y_width, y_height,
+                                        extend_top, extend_left, extend_bottom,
+                                        extend_right, blk_col_start,
+                                        blk_row_start, blk_width, blk_height);
 
-      blk_col_start >>= subsampling_x;
-      blk_row_start >>= subsampling_y;
-      blk_width >>= subsampling_x;
-      blk_height >>= subsampling_y;
-      tip_extend_plane_block_based_highbd(
-          u_dst, uv_stride, uv_width, uv_heigh, uv_extend_top, uv_extend_left,
-          uv_extend_bottom, uv_extend_right, blk_col_start, blk_row_start,
-          blk_width, blk_height);
-      tip_extend_plane_block_based_highbd(
-          v_dst, uv_stride, uv_width, uv_heigh, uv_extend_top, uv_extend_left,
-          uv_extend_bottom, uv_extend_right, blk_col_start, blk_row_start,
-          blk_width, blk_height);
-    } else {
-      tip_extend_plane_block_based(y_dst, y_stride, y_width, y_height,
-                                   extend_top, extend_left, extend_bottom,
-                                   extend_right, blk_col_start, blk_row_start,
-                                   blk_width, blk_height);
-
-      blk_col_start >>= subsampling_x;
-      blk_row_start >>= subsampling_y;
-      blk_width >>= subsampling_x;
-      blk_height >>= subsampling_y;
-      tip_extend_plane_block_based(
-          u_dst, uv_stride, uv_width, uv_heigh, uv_extend_top, uv_extend_left,
-          uv_extend_bottom, uv_extend_right, blk_col_start, blk_row_start,
-          blk_width, blk_height);
-      tip_extend_plane_block_based(
-          v_dst, uv_stride, uv_width, uv_heigh, uv_extend_top, uv_extend_left,
-          uv_extend_bottom, uv_extend_right, blk_col_start, blk_row_start,
-          blk_width, blk_height);
-    }
+    blk_col_start >>= subsampling_x;
+    blk_row_start >>= subsampling_y;
+    blk_width >>= subsampling_x;
+    blk_height >>= subsampling_y;
+    tip_extend_plane_block_based_highbd(
+        u_dst, uv_stride, uv_width, uv_heigh, uv_extend_top, uv_extend_left,
+        uv_extend_bottom, uv_extend_right, blk_col_start, blk_row_start,
+        blk_width, blk_height);
+    tip_extend_plane_block_based_highbd(
+        v_dst, uv_stride, uv_width, uv_heigh, uv_extend_top, uv_extend_left,
+        uv_extend_bottom, uv_extend_right, blk_col_start, blk_row_start,
+        blk_width, blk_height);
   }
 }
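
The 8-bit tip_extend_plane_block_based() above is deleted because the TIP reference buffers are now always 16-bit, so tip_extend_plane_block_based_highbd() is used unconditionally. For reference, the 16-bit analogue of the deleted left-edge loop looks roughly like the sketch below (illustrative only; the actual highbd helper already exists in tip.c and is unchanged by this MR):

    #include <stddef.h>
    #include <stdint.h>

    /* Replicate the first pixel of each of `rows` rows into an extend_left-wide
       left border. With 16-bit samples a memset of the pixel value is no longer
       possible, hence the explicit inner loop. */
    static void extend_left_16bit(uint16_t *row0, ptrdiff_t stride_px, int rows,
                                  int extend_left) {
      for (int i = 0; i < rows; ++i) {
        uint16_t *row = row0 + i * stride_px;
        for (int j = 1; j <= extend_left; ++j) row[-j] = row[0];
      }
    }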
 
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 6a8365d..1db0f1b 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -277,8 +277,90 @@
          error_measure_lut[256 + e1] * e2;
 }
 
-/* Note: For an explanation of the warp algorithm, and some notes on bit widths
-    for hardware implementations, see the comments above av1_warp_affine_c
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+   * Split the input into 8x8 blocks
+   * For each block, project the point (4, 4) within the block, to get the
+     overall block position. Split into integer and fractional coordinates,
+     maintaining full WARPEDMODEL precision
+   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+     variable horizontal offset. This means that, while the rows of the
+     intermediate buffer align with the rows of the *reference* image, the
+     columns align with the columns of the *destination* image.
+   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+     destination is too small we crop the output at this stage). Each pixel has
+     a variable vertical offset, so that the resulting rows are aligned with
+     the rows of the destination image.
+
+   To accomplish these alignments, we factor the warp matrix as a
+   product of two shear / asymmetric zoom matrices:
+   / a b \  = /   1       0    \ * / 1+alpha  beta \
+   \ c d /    \ gamma  1+delta /   \    0      1   /
+   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+   The horizontal shear (with alpha and beta) is applied first,
+   then the vertical shear (with gamma and delta) is applied second.
+
+   The only limitation is that, to fit this in a fixed 8-tap filter size,
+   the fractional pixel offsets must be at most +-1. Since the horizontal filter
+   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+   within the block, the parameters must satisfy
+   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
+   for this filter to be applicable.
+
+   Note: This function assumes that the caller has done all of the relevant
+   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+   gamma, delta are all in range.
+
+   TODO(rachelbarker): Maybe support scaled references?
+*/
+/* A note on hardware implementation:
+    The warp filter is intended to be implementable using the same hardware as
+    the high-precision convolve filters from the loop-restoration and
+    convolve-round experiments.
+
+    For a single filter stage, considering all of the coefficient sets for the
+    warp filter and the regular convolution filter, an input in the range
+    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
+    before rounding.
+
+    Allowing for some changes to the filter coefficient sets, call the range
+    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
+    we can replace this by the range [0, 256 * 2^k], which can be stored in an
+    unsigned value with 8 + k bits.
+
+    This allows the derivation of the appropriate bit widths and offsets for
+    the various intermediate values: If
+
+    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
+         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
+         intermediate value.
+    H := ROUND0_BITS
+    V := VERSHEAR_REDUCE_PREC_BITS
+    (and note that we must have H + V = 2*F for the output to have the same
+     scale as the input)
+
+    then we end up with the following offsets and ranges:
+    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
+                       uint{bd + F + 1}
+    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
+    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
+                     uint{bd + 2*F + 2 - H}
+    After rounding: The final value, before undoing the offset, fits into a
+                    uint{bd + 2}.
+
+    Then we need to undo the offsets before clamping to a pixel. Note that,
+    if we do this at the end, the amount to subtract is actually independent
+    of H and V:
+
+    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
+                         (1 << ((bd + 2*F - H) - V))
+                      == (1 << (bd - 1)) + (1 << bd)
+
+    This allows us to entirely avoid clamping in both the warp filter and
+    the convolve-round experiment. As of the time of writing, the Wiener filter
+    from loop-restoration can encode a central coefficient up to 216, which
+    leads to a maximum value of about 282 * 2^k after applying the offset.
+    So in that case we still need to clamp.
 */
 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                               int width, int height, int stride, uint16_t *pred,
@@ -467,319 +549,30 @@
   return sum_error;
 }
 
-/* The warp filter for ROTZOOM and AFFINE models works as follows:
-   * Split the input into 8x8 blocks
-   * For each block, project the point (4, 4) within the block, to get the
-     overall block position. Split into integer and fractional coordinates,
-     maintaining full WARPEDMODEL precision
-   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
-     variable horizontal offset. This means that, while the rows of the
-     intermediate buffer align with the rows of the *reference* image, the
-     columns align with the columns of the *destination* image.
-   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
-     destination is too small we crop the output at this stage). Each pixel has
-     a variable vertical offset, so that the resulting rows are aligned with
-     the rows of the destination image.
-
-   To accomplish these alignments, we factor the warp matrix as a
-   product of two shear / asymmetric zoom matrices:
-   / a b \  = /   1       0    \ * / 1+alpha  beta \
-   \ c d /    \ gamma  1+delta /   \    0      1   /
-   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
-   The horizontal shear (with alpha and beta) is applied first,
-   then the vertical shear (with gamma and delta) is applied second.
-
-   The only limitation is that, to fit this in a fixed 8-tap filter size,
-   the fractional pixel offsets must be at most +-1. Since the horizontal filter
-   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
-   within the block, the parameters must satisfy
-   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
-   for this filter to be applicable.
-
-   Note: This function assumes that the caller has done all of the relevant
-   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
-   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
-   gamma, delta are all in range.
-
-   TODO(rachelbarker): Maybe support scaled references?
-*/
-/* A note on hardware implementation:
-    The warp filter is intended to be implementable using the same hardware as
-    the high-precision convolve filters from the loop-restoration and
-    convolve-round experiments.
-
-    For a single filter stage, considering all of the coefficient sets for the
-    warp filter and the regular convolution filter, an input in the range
-    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
-    before rounding.
-
-    Allowing for some changes to the filter coefficient sets, call the range
-    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
-    we can replace this by the range [0, 256 * 2^k], which can be stored in an
-    unsigned value with 8 + k bits.
-
-    This allows the derivation of the appropriate bit widths and offsets for
-    the various intermediate values: If
-
-    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
-         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
-         intermediate value.
-    H := ROUND0_BITS
-    V := VERSHEAR_REDUCE_PREC_BITS
-    (and note that we must have H + V = 2*F for the output to have the same
-     scale as the input)
-
-    then we end up with the following offsets and ranges:
-    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
-                       uint{bd + F + 1}
-    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
-    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
-                     uint{bd + 2*F + 2 - H}
-    After rounding: The final value, before undoing the offset, fits into a
-                    uint{bd + 2}.
-
-    Then we need to undo the offsets before clamping to a pixel. Note that,
-    if we do this at the end, the amount to subtract is actually independent
-    of H and V:
-
-    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
-                         (1 << ((bd + 2*F - H) - V))
-                      == (1 << (bd - 1)) + (1 << bd)
-
-    This allows us to entirely avoid clamping in both the warp filter and
-    the convolve-round experiment. As of the time of writing, the Wiener filter
-    from loop-restoration can encode a central coefficient up to 216, which
-    leads to a maximum value of about 282 * 2^k after applying the offset.
-    So in that case we still need to clamp.
-*/
-void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
-                       int height, int stride, uint8_t *pred, int p_col,
-                       int p_row, int p_width, int p_height, int p_stride,
-                       int subsampling_x, int subsampling_y,
-                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
-                       int16_t gamma, int16_t delta) {
-  int32_t tmp[15 * 8];
-  const int bd = 8;
-  const int reduce_bits_horiz = conv_params->round_0;
-  const int reduce_bits_vert = conv_params->is_compound
-                                   ? conv_params->round_1
-                                   : 2 * FILTER_BITS - reduce_bits_horiz;
-  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  (void)max_bits_horiz;
-  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
-
-  for (int i = p_row; i < p_row + p_height; i += 8) {
-    for (int j = p_col; j < p_col + p_width; j += 8) {
-      // Calculate the center of this 8x8 block,
-      // project to luma coordinates (if in a subsampled chroma plane),
-      // apply the affine transformation,
-      // then convert back to the original coordinates (if necessary)
-      const int32_t src_x = (j + 4) << subsampling_x;
-      const int32_t src_y = (i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      sx4 += alpha * (-4) + beta * (-4);
-      sy4 += gamma * (-4) + delta * (-4);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
-      // Horizontal filter
-      for (int k = -7; k < 8; ++k) {
-        // Clamp to top/bottom edge of the frame
-        const int iy = clamp(iy4 + k, 0, height - 1);
-
-        int sx = sx4 + beta * (k + 4);
-
-        for (int l = -4; l < 4; ++l) {
-          int ix = ix4 + l - 3;
-          // At this point, sx = sx4 + alpha * l + beta * k
-          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
-                           WARPEDPIXEL_PREC_SHIFTS;
-          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = av1_warped_filter[offs];
-
-          int32_t sum = 1 << offset_bits_horiz;
-          for (int m = 0; m < 8; ++m) {
-            // Clamp to left/right edge of the frame
-            const int sample_x = clamp(ix + m, 0, width - 1);
-
-            sum += ref[iy * stride + sample_x] * coeffs[m];
-          }
-          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
-          assert(0 <= sum && sum < (1 << max_bits_horiz));
-          tmp[(k + 7) * 8 + (l + 4)] = sum;
-          sx += alpha;
-        }
-      }
-
-      // Vertical filter
-      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
-        int sy = sy4 + delta * (k + 4);
-        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
-          // At this point, sy = sy4 + gamma * l + delta * k
-          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
-                           WARPEDPIXEL_PREC_SHIFTS;
-          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
-          const int16_t *coeffs = av1_warped_filter[offs];
-
-          int32_t sum = 1 << offset_bits_vert;
-          for (int m = 0; m < 8; ++m) {
-            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
-          }
-
-          if (conv_params->is_compound) {
-            CONV_BUF_TYPE *p =
-                &conv_params
-                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
-                           (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
-            if (conv_params->do_average) {
-              uint8_t *dst8 =
-                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-              int32_t tmp32 = *p;
-              if (use_wtd_comp_avg) {
-                tmp32 = tmp32 * conv_params->fwd_offset +
-                        sum * conv_params->bck_offset;
-                tmp32 = tmp32 >> DIST_PRECISION_BITS;
-              } else {
-                tmp32 += sum;
-                tmp32 = tmp32 >> 1;
-              }
-              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
-                      (1 << (offset_bits - conv_params->round_1 - 1));
-              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
-            } else {
-              *p = sum;
-            }
-          } else {
-            uint8_t *p =
-                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
-            assert(0 <= sum && sum < (1 << (bd + 2)));
-            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
-          }
-          sy += gamma;
-        }
-      }
-    }
-  }
+int64_t av1_frame_error(int bd, const uint8_t *ref, int stride, uint8_t *dst,
+                        int p_width, int p_height, int p_stride) {
+  return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+                                     CONVERT_TO_SHORTPTR(dst), p_width,
+                                     p_height, p_stride, bd);
 }
 
-void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
-                int height, int stride, uint8_t *pred, int p_col, int p_row,
-                int p_width, int p_height, int p_stride, int subsampling_x,
-                int subsampling_y, ConvolveParams *conv_params) {
-  assert(wm->wmtype <= AFFINE);
-  if (wm->wmtype == ROTZOOM) {
-    wm->wmmat[5] = wm->wmmat[2];
-    wm->wmmat[4] = -wm->wmmat[3];
-  }
-  const int32_t *const mat = wm->wmmat;
-  const int16_t alpha = wm->alpha;
-  const int16_t beta = wm->beta;
-  const int16_t gamma = wm->gamma;
-  const int16_t delta = wm->delta;
-  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
-                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
-                  alpha, beta, gamma, delta);
-}
-
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
-                               const uint8_t *const dst, int p_width,
-                               int p_height, int p_stride) {
-  int64_t sum_error = 0;
-  for (int i = 0; i < p_height; ++i) {
-    for (int j = 0; j < p_width; ++j) {
-      sum_error +=
-          (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
-    }
-  }
-  return sum_error;
-}
-
-static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
-                                     const uint8_t *const dst, int p_width,
-                                     int p_height, int p_stride,
-                                     uint8_t *segment_map,
-                                     int segment_map_stride) {
-  int patch_w, patch_h;
-  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
-  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  int64_t sum_error = 0;
-  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
-    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
-      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
-      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
-      // Only compute the error if this block contains inliers from the motion
-      // model
-      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
-
-      // avoid computing error into the frame padding
-      patch_w = AOMMIN(error_bsize_w, p_width - j);
-      patch_h = AOMMIN(error_bsize_h, p_height - i);
-      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
-                                        dst + j + i * p_stride, patch_w,
-                                        patch_h, p_stride);
-    }
-  }
-  return sum_error;
-}
-
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
-                        uint8_t *dst, int p_width, int p_height, int p_stride) {
-  if (use_hbd) {
-    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
-                                       CONVERT_TO_SHORTPTR(dst), p_width,
-                                       p_height, p_stride, bd);
-  }
-
-  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
-}
-
-int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
-                                  int stride, uint8_t *dst, int p_width,
-                                  int p_height, int p_stride,
-                                  uint8_t *segment_map,
+int64_t av1_segmented_frame_error(int bd, const uint8_t *ref, int stride,
+                                  uint8_t *dst, int p_width, int p_height,
+                                  int p_stride, uint8_t *segment_map,
                                   int segment_map_stride) {
-  if (use_hbd) {
-    return highbd_segmented_frame_error(
-        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
-        p_height, p_stride, bd, segment_map, segment_map_stride);
-  }
-
-  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
-                               segment_map, segment_map_stride);
+  return highbd_segmented_frame_error(
+      CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
+      p_height, p_stride, bd, segment_map, segment_map_stride);
 }
 
-void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
-                    const uint8_t *ref, int width, int height, int stride,
-                    uint8_t *pred, int p_col, int p_row, int p_width,
-                    int p_height, int p_stride, int subsampling_x,
-                    int subsampling_y, ConvolveParams *conv_params) {
-  if (use_hbd)
-    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
-                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
-                      p_height, p_stride, subsampling_x, subsampling_y, bd,
-                      conv_params);
-  else
-    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
-               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+void av1_warp_plane(WarpedMotionParams *wm, int bd, const uint8_t *ref,
+                    int width, int height, int stride, uint8_t *pred, int p_col,
+                    int p_row, int p_width, int p_height, int p_stride,
+                    int subsampling_x, int subsampling_y,
+                    ConvolveParams *conv_params) {
+  highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+                    CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width, p_height,
+                    p_stride, subsampling_x, subsampling_y, bd, conv_params);
 }
 
 #define LS_MV_MAX 256  // max mv in 1/8-pel
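
The hardware note that now sits above av1_highbd_warp_affine_c() derives that the offset to subtract before clamping collapses to (1 << (bd - 1)) + (1 << bd) regardless of how the rounding bits are split between H and V. A standalone check of that identity for bd = 10 and every split satisfying H + V = 2*F (illustrative, not part of the patch):

    #include <assert.h>

    int main(void) {
      const int bd = 10, F = 7;
      for (int H = 0; H <= 2 * F; ++H) {
        const int V = 2 * F - H;
        const int off = (1 << ((bd + F - 1) - H + F - V)) +
                        (1 << ((bd + 2 * F - H) - V));
        assert(off == (1 << (bd - 1)) + (1 << bd)); /* 512 + 1024 = 1536 */
      }
      return 0;
    }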
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 7dd2550..6c29597 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -191,13 +191,13 @@
 
 // Returns the error between the frame described by 'ref' and the frame
 // described by 'dst'.
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
-                        uint8_t *dst, int p_width, int p_height, int p_stride);
+int64_t av1_frame_error(int bd, const uint8_t *ref, int stride, uint8_t *dst,
+                        int p_width, int p_height, int p_stride);
 
-int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
-                                  int stride, uint8_t *dst, int p_width,
-                                  int p_height, int p_stride,
-                                  uint8_t *segment_map, int segment_map_stride);
+int64_t av1_segmented_frame_error(int bd, const uint8_t *ref, int stride,
+                                  uint8_t *dst, int p_width, int p_height,
+                                  int p_stride, uint8_t *segment_map,
+                                  int segment_map_stride);
 
 int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
                                     const uint16_t *const dst, int p_width,
@@ -214,11 +214,11 @@
                 int p_width, int p_height, int p_stride, int subsampling_x,
                 int subsampling_y, ConvolveParams *conv_params);
 
-void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
-                    const uint8_t *ref, int width, int height, int stride,
-                    uint8_t *pred, int p_col, int p_row, int p_width,
-                    int p_height, int p_stride, int subsampling_x,
-                    int subsampling_y, ConvolveParams *conv_params);
+void av1_warp_plane(WarpedMotionParams *wm, int bd, const uint8_t *ref,
+                    int width, int height, int stride, uint8_t *pred, int p_col,
+                    int p_row, int p_width, int p_height, int p_stride,
+                    int subsampling_x, int subsampling_y,
+                    ConvolveParams *conv_params);
 
 int av1_find_projection(int np, const int *pts1, const int *pts2,
                         BLOCK_SIZE bsize, int mvy, int mvx,
diff --git a/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/av1/common/x86/av1_convolve_horiz_rs_sse4.c
index 727ff16..ca58ee6 100644
--- a/av1/common/x86/av1_convolve_horiz_rs_sse4.c
+++ b/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -23,114 +23,6 @@
 // this function will overwrite some of the padding on the right hand side of
 // the frame. This padding appears to be trashed anyway, so this should not
 // affect the running of the decoder.
-void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const int16_t *x_filters, int x0_qn,
-                                  int x_step_qn) {
-  assert(UPSCALE_NORMATIVE_TAPS == 8);
-
-  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
-
-  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
-  const __m128i zero = _mm_setzero_si128();
-
-  const uint8_t *src_y;
-  uint8_t *dst_y;
-  int x_qn = x0_qn;
-  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
-    const int x_filter_idx0 =
-        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
-    const int x_filter_idx1 =
-        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
-    const int x_filter_idx2 =
-        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
-    const int x_filter_idx3 =
-        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
-
-    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
-    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
-    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
-    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
-
-    const int16_t *const x_filter0 =
-        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
-    const int16_t *const x_filter1 =
-        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
-    const int16_t *const x_filter2 =
-        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
-    const int16_t *const x_filter3 =
-        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
-
-    const __m128i fil0_16 = xx_loadu_128(x_filter0);
-    const __m128i fil1_16 = xx_loadu_128(x_filter1);
-    const __m128i fil2_16 = xx_loadu_128(x_filter2);
-    const __m128i fil3_16 = xx_loadu_128(x_filter3);
-
-    src_y = src;
-    dst_y = dst;
-    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
-      const uint8_t *const src_x0 =
-          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
-      const uint8_t *const src_x1 =
-          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
-      const uint8_t *const src_x2 =
-          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
-      const uint8_t *const src_x3 =
-          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
-
-      // Load up the source data. This is 8-bit input data, so each load
-      // gets 8 pixels.
-      const __m128i src0_8 = xx_loadl_64(src_x0);
-      const __m128i src1_8 = xx_loadl_64(src_x1);
-      const __m128i src2_8 = xx_loadl_64(src_x2);
-      const __m128i src3_8 = xx_loadl_64(src_x3);
-
-      // Now zero-extend up to 16-bit precision, i.e.
-      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
-      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
-      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
-      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
-      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
-
-      // Multiply by filter coefficients (results in a 32-bit value),
-      // and add adjacent pairs, i.e.
-      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
-      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
-      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
-      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
-      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
-      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
-
-      // Reduce horizontally and add, i.e.
-      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
-      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
-      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
-
-      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
-
-      // Divide down by (1 << FILTER_BITS), rounding to nearest.
-      const __m128i shifted_32 =
-          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
-
-      // Pack 32-bit values into 16-bit values, i.e.
-      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
-      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
-
-      // Pack 16-bit values into 8-bit values, i.e.
-      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
-      // -> [ 0 0 0 0 0 0 DC BA ]
-      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
-
-      // Write to the output
-      xx_storel_32(&dst_y[x], shifted_8);
-    }
-  }
-}
-
-// Note: If the crop width is not a multiple of 4, then, unlike the C version,
-// this function will overwrite some of the padding on the right hand side of
-// the frame. This padding appears to be trashed anyway, so this should not
-// affect the running of the decoder.
 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
                                          uint16_t *dst, int dst_stride, int w,
                                          int h, const int16_t *x_filters,
diff --git a/av1/common/x86/av1_convolve_scale_sse4.c b/av1/common/x86/av1_convolve_scale_sse4.c
index f9cd29b..51553ff 100644
--- a/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/av1/common/x86/av1_convolve_scale_sse4.c
@@ -19,244 +19,11 @@
 #include "aom_dsp/aom_filter.h"
 #include "av1/common/convolve.h"
 
-// A specialised version of hfilter, the horizontal filter for
-// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
-                     int h, int subpel_x_qn, int x_step_qn,
-                     const InterpFilterParams *filter_params, unsigned round) {
-  const int bd = 8;
-  const int ntaps = 8;
-
-  src -= ntaps / 2 - 1;
-
-  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
-  const __m128i round_add = _mm_set1_epi32(round_add32);
-  const __m128i round_shift = _mm_cvtsi32_si128(round);
-
-  int x_qn = subpel_x_qn;
-  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
-    const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
-    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-    assert(filter_idx < SUBPEL_SHIFTS);
-    const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
-
-    // Load the filter coefficients
-    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
-    const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
-
-    int y;
-    for (y = 0; y <= h - 4; y += 4) {
-      const uint8_t *const src0 = src_col + y * src_stride;
-      const uint8_t *const src1 = src0 + 1 * src_stride;
-      const uint8_t *const src2 = src0 + 2 * src_stride;
-      const uint8_t *const src3 = src0 + 3 * src_stride;
-
-      // Load up source data. This is 8-bit input data; each load is just
-      // loading the lower half of the register and gets 8 pixels
-      const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
-      const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
-      const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
-      const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
-
-      // Now zero-extend up to 16-bit precision by interleaving with
-      // zeros. Drop the upper half of each register (which just had zeros)
-      const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
-      const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
-      const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
-      const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
-
-      // Multiply by coefficients
-      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
-      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
-      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
-      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
-
-      // Reduce horizontally and add
-      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
-      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
-      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
-
-      // Divide down by (1 << round), rounding to nearest.
-      __m128i shifted =
-          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
-      shifted = _mm_packus_epi32(shifted, shifted);
-      // Write transposed to the output
-      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
-    }
-    for (; y < h; ++y) {
-      const uint8_t *const src_row = src_col + y * src_stride;
-
-      int32_t sum = (1 << (bd + FILTER_BITS - 1));
-      for (int k = 0; k < ntaps; ++k) {
-        sum += filter[k] * src_row[k];
-      }
-
-      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
-    }
-  }
-}
-
 static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
   __m128i data = _mm_loadu_si128((__m128i *)src);
   return _mm_madd_epi16(data, coeff);
 }
 
-// A specialised version of vfilter, the vertical filter for
-// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
-                     int dst_stride, int w, int h, int subpel_y_qn,
-                     int y_step_qn, const InterpFilterParams *filter_params,
-                     const ConvolveParams *conv_params, int bd) {
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int ntaps = 8;
-
-  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
-                         (1 << (offset_bits - conv_params->round_1 - 1)));
-  const __m128i sub = _mm_set1_epi16(sub32);
-
-  CONV_BUF_TYPE *dst16 = conv_params->dst;
-  const int dst16_stride = conv_params->dst_stride;
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
-  const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
-  const __m128i round_shift_add =
-      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
-  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
-
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16((short)w0);
-  const __m128i wt1 = _mm_set1_epi16((short)w1);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
-
-  int y_qn = subpel_y_qn;
-  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
-    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
-    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-    assert(filter_idx < SUBPEL_SHIFTS);
-    const int16_t *filter =
-        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
-
-    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
-    int x;
-    for (x = 0; x <= w - 4; x += 4) {
-      const int16_t *const src0 = src_y + x * src_stride;
-      const int16_t *const src1 = src0 + 1 * src_stride;
-      const int16_t *const src2 = src0 + 2 * src_stride;
-      const int16_t *const src3 = src0 + 3 * src_stride;
-
-      // Load the source data for the three rows, adding the three registers of
-      // convolved products to one as we go (conv0..conv3) to avoid the
-      // register pressure getting too high.
-      const __m128i conv0 = convolve_16_8(src0, coeff0716);
-      const __m128i conv1 = convolve_16_8(src1, coeff0716);
-      const __m128i conv2 = convolve_16_8(src2, coeff0716);
-      const __m128i conv3 = convolve_16_8(src3, coeff0716);
-
-      // Now reduce horizontally to get one lane for each result
-      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
-      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
-      __m128i conv = _mm_hadd_epi32(conv01, conv23);
-
-      conv = _mm_add_epi32(conv, res_add_const);
-      // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
-      __m128i shifted =
-          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
-
-      uint8_t *dst_x = dst + y * dst_stride + x;
-      CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
-      __m128i result;
-      __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
-
-      if (conv_params->is_compound) {
-        if (conv_params->do_average) {
-          const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
-          if (use_wtd_comp_avg) {
-            const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
-            const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
-            const __m128i shifted_32 =
-                _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-            shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
-          } else {
-            shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
-          }
-          const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
-          result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
-          const __m128i result_8 = _mm_packus_epi16(result, result);
-          *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
-        } else {
-          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
-        }
-      } else {
-        const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
-        result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
-        const __m128i result_8 = _mm_packus_epi16(result, result);
-        *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
-      }
-    }
-    for (; x < w; ++x) {
-      const int16_t *src_x = src_y + x * src_stride;
-      int32_t sum = 1 << offset_bits;
-      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
-      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
-
-      if (conv_params->is_compound) {
-        if (conv_params->do_average) {
-          int32_t tmp = dst16[y * dst16_stride + x];
-          if (use_wtd_comp_avg) {
-            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
-            tmp = tmp >> DIST_PRECISION_BITS;
-          } else {
-            tmp += res;
-            tmp = tmp >> 1;
-          }
-          /* Subtract round offset and convolve round */
-          tmp = tmp - sub32;
-          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
-        } else {
-          dst16[y * dst16_stride + x] = res;
-        }
-      } else {
-        /* Subtract round offset and convolve round */
-        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
-                             (1 << (offset_bits - conv_params->round_1 - 1)));
-        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
-      }
-    }
-  }
-}
-void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
-                                  uint8_t *dst8, int dst8_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int x_step_qn,
-                                  const int subpel_y_qn, const int y_step_qn,
-                                  ConvolveParams *conv_params) {
-  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
-  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
-             filter_params_y->taps;
-
-  const int xtaps = filter_params_x->taps;
-  const int ytaps = filter_params_y->taps;
-  const int fo_vert = ytaps / 2 - 1;
-  assert((xtaps == 8) && (ytaps == 8));
-  (void)xtaps;
-
-  // horizontal filter
-  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
-           x_step_qn, filter_params_x, conv_params->round_0);
-
-  // vertical filter (input is transposed)
-  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
-           filter_params_y, conv_params, 8);
-}
-
 // A specialised version of hfilter, the horizontal filter for
 // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
 // filters.
diff --git a/av1/common/x86/av1_inv_txfm_avx2.c b/av1/common/x86/av1_inv_txfm_avx2.c
deleted file mode 100644
index e8a8558..0000000
--- a/av1/common/x86/av1_inv_txfm_avx2.c
+++ /dev/null
@@ -1,1950 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include "config/aom_config.h"
-
-#include "config/av1_rtcd.h"
-
-#include "av1/common/av1_inv_txfm1d_cfg.h"
-#include "av1/common/x86/av1_txfm_sse2.h"
-#include "av1/common/x86/av1_inv_txfm_avx2.h"
-#include "av1/common/x86/av1_inv_txfm_ssse3.h"
-
-// TODO(venkatsanampudi@ittiam.com): move this to header file
-
-// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
-static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
-                                          4 * 5793 };
-
-static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
-                                      const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
-  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
-
-  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
-  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
-  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
-  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
-}
-
-static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(&x[0], &x[7]);
-  btf_16_adds_subs_avx2(&x[1], &x[6]);
-  btf_16_adds_subs_avx2(&x[2], &x[5]);
-  btf_16_adds_subs_avx2(&x[3], &x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-}
-
-static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
-  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
-  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
-  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
-  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
-  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
-  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
-  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
-  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
-}
-
-static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
-  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
-  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
-  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
-  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
-  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
-  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
-  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
-  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
-  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
-  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
-  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
-  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
-  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
-  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
-  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-
-  // stage 1
-  __m256i x1[16];
-  x1[0] = input[0];
-  x1[1] = input[8];
-  x1[2] = input[4];
-  x1[3] = input[12];
-  x1[4] = input[2];
-  x1[5] = input[10];
-  x1[6] = input[6];
-  x1[7] = input[14];
-  x1[8] = input[1];
-  x1[9] = input[9];
-  x1[10] = input[5];
-  x1[11] = input[13];
-  x1[12] = input[3];
-  x1[13] = input[11];
-  x1[14] = input[7];
-  x1[15] = input[15];
-
-  // stage 2
-  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
-
-  // stage 3
-  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
-  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
-  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
-  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
-
-  // stage 4
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
-  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
-
-  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
-  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
-  idct16_stage7_avx2(output, x1);
-}
-
-static void idct16_low8_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-
-  // stage 1
-  __m256i x1[16];
-  x1[0] = input[0];
-  x1[2] = input[4];
-  x1[4] = input[2];
-  x1[6] = input[6];
-  x1[8] = input[1];
-  x1[10] = input[5];
-  x1[12] = input[3];
-  x1[14] = input[7];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
-  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
-  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
-  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
-
-  // stage 3
-  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
-  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
-  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
-  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
-  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
-  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
-
-  // stage 4
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
-  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
-  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
-  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
-
-  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
-  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
-  idct16_stage7_avx2(output, x1);
-}
-
-static void idct16_low1_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m256i x1[2];
-  x1[0] = input[0];
-
-  // stage 2
-  // stage 3
-  // stage 4
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
-
-  // stage 5
-  // stage 6
-  output[0] = x1[0];
-  output[1] = x1[1];
-  output[2] = x1[1];
-  output[3] = x1[0];
-  output[4] = x1[0];
-  output[5] = x1[1];
-  output[6] = x1[1];
-  output[7] = x1[0];
-  output[8] = x1[0];
-  output[9] = x1[1];
-  output[10] = x1[1];
-  output[11] = x1[0];
-  output[12] = x1[0];
-  output[13] = x1[1];
-  output[14] = x1[1];
-  output[15] = x1[0];
-}
-
-static INLINE void iadst16_stage3_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(&x[0], &x[8]);
-  btf_16_adds_subs_avx2(&x[1], &x[9]);
-  btf_16_adds_subs_avx2(&x[2], &x[10]);
-  btf_16_adds_subs_avx2(&x[3], &x[11]);
-  btf_16_adds_subs_avx2(&x[4], &x[12]);
-  btf_16_adds_subs_avx2(&x[5], &x[13]);
-  btf_16_adds_subs_avx2(&x[6], &x[14]);
-  btf_16_adds_subs_avx2(&x[7], &x[15]);
-}
-
-static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
-                                       const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
-  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
-  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
-  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
-  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
-  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
-  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
-}
-
-static INLINE void iadst16_stage5_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(&x[0], &x[4]);
-  btf_16_adds_subs_avx2(&x[1], &x[5]);
-  btf_16_adds_subs_avx2(&x[2], &x[6]);
-  btf_16_adds_subs_avx2(&x[3], &x[7]);
-  btf_16_adds_subs_avx2(&x[8], &x[12]);
-  btf_16_adds_subs_avx2(&x[9], &x[13]);
-  btf_16_adds_subs_avx2(&x[10], &x[14]);
-  btf_16_adds_subs_avx2(&x[11], &x[15]);
-}
-
-static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
-                                       const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
-  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
-  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
-}
-
-static INLINE void iadst16_stage7_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(&x[0], &x[2]);
-  btf_16_adds_subs_avx2(&x[1], &x[3]);
-  btf_16_adds_subs_avx2(&x[4], &x[6]);
-  btf_16_adds_subs_avx2(&x[5], &x[7]);
-  btf_16_adds_subs_avx2(&x[8], &x[10]);
-  btf_16_adds_subs_avx2(&x[9], &x[11]);
-  btf_16_adds_subs_avx2(&x[12], &x[14]);
-  btf_16_adds_subs_avx2(&x[13], &x[15]);
-}
-
-static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
-                                       const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
-}
-
-static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
-  const __m256i __zero = _mm256_setzero_si256();
-  output[0] = x1[0];
-  output[1] = _mm256_subs_epi16(__zero, x1[8]);
-  output[2] = x1[12];
-  output[3] = _mm256_subs_epi16(__zero, x1[4]);
-  output[4] = x1[6];
-  output[5] = _mm256_subs_epi16(__zero, x1[14]);
-  output[6] = x1[10];
-  output[7] = _mm256_subs_epi16(__zero, x1[2]);
-  output[8] = x1[3];
-  output[9] = _mm256_subs_epi16(__zero, x1[11]);
-  output[10] = x1[15];
-  output[11] = _mm256_subs_epi16(__zero, x1[7]);
-  output[12] = x1[5];
-  output[13] = _mm256_subs_epi16(__zero, x1[13]);
-  output[14] = x1[9];
-  output[15] = _mm256_subs_epi16(__zero, x1[1]);
-}
-
-static void iadst16_avx2(const __m256i *input, __m256i *output,
-                         int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
-  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
-  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
-  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
-  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
-  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
-  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
-  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
-  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
-  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
-  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
-  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
-  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
-  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
-  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
-  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
-
-  // stage 1
-  __m256i x1[16];
-  x1[0] = input[15];
-  x1[1] = input[0];
-  x1[2] = input[13];
-  x1[3] = input[2];
-  x1[4] = input[11];
-  x1[5] = input[4];
-  x1[6] = input[9];
-  x1[7] = input[6];
-  x1[8] = input[7];
-  x1[9] = input[8];
-  x1[10] = input[5];
-  x1[11] = input[10];
-  x1[12] = input[3];
-  x1[13] = input[12];
-  x1[14] = input[1];
-  x1[15] = input[14];
-
-  // stage 2
-  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
-
-  iadst16_stage3_avx2(x1);
-  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage5_avx2(x1);
-  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage7_avx2(x1);
-  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage9_avx2(output, x1);
-}
-
-static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  // stage 1
-  __m256i x1[16];
-  x1[1] = input[0];
-  x1[3] = input[2];
-  x1[5] = input[4];
-  x1[7] = input[6];
-  x1[8] = input[7];
-  x1[10] = input[5];
-  x1[12] = input[3];
-  x1[14] = input[1];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
-  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
-  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
-  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
-  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
-  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
-  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
-  btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
-
-  iadst16_stage3_avx2(x1);
-  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage5_avx2(x1);
-  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage7_avx2(x1);
-  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage9_avx2(output, x1);
-}
-
-static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
-  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
-  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
-  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
-
-  // stage 1
-  __m256i x1[16];
-  x1[1] = input[0];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
-
-  // stage 3
-  x1[8] = x1[0];
-  x1[9] = x1[1];
-
-  // stage 4
-  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
-
-  // stage 5
-  x1[4] = x1[0];
-  x1[5] = x1[1];
-
-  x1[12] = x1[8];
-  x1[13] = x1[9];
-
-  // stage 6
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
-
-  // stage 7
-  x1[2] = x1[0];
-  x1[3] = x1[1];
-  x1[6] = x1[4];
-  x1[7] = x1[5];
-  x1[10] = x1[8];
-  x1[11] = x1[9];
-  x1[14] = x1[12];
-  x1[15] = x1[13];
-
-  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
-  iadst16_stage9_avx2(output, x1);
-}
-
-static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
-  btf_16_adds_subs_avx2(&x[16], &x[17]);
-  btf_16_adds_subs_avx2(&x[19], &x[18]);
-  btf_16_adds_subs_avx2(&x[20], &x[21]);
-  btf_16_adds_subs_avx2(&x[23], &x[22]);
-  btf_16_adds_subs_avx2(&x[24], &x[25]);
-  btf_16_adds_subs_avx2(&x[27], &x[26]);
-  btf_16_adds_subs_avx2(&x[28], &x[29]);
-  btf_16_adds_subs_avx2(&x[31], &x[30]);
-}
-
-static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
-  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
-  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
-  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
-  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
-  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
-}
-
-static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[16], &x[19]);
-  btf_16_adds_subs_avx2(&x[17], &x[18]);
-  btf_16_adds_subs_avx2(&x[23], &x[20]);
-  btf_16_adds_subs_avx2(&x[22], &x[21]);
-  btf_16_adds_subs_avx2(&x[24], &x[27]);
-  btf_16_adds_subs_avx2(&x[25], &x[26]);
-  btf_16_adds_subs_avx2(&x[31], &x[28]);
-  btf_16_adds_subs_avx2(&x[30], &x[29]);
-}
-
-static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[8], &x[11]);
-  btf_16_adds_subs_avx2(&x[9], &x[10]);
-  btf_16_adds_subs_avx2(&x[15], &x[12]);
-  btf_16_adds_subs_avx2(&x[14], &x[13]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
-}
-
-static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(&x[0], &x[7]);
-  btf_16_adds_subs_avx2(&x[1], &x[6]);
-  btf_16_adds_subs_avx2(&x[2], &x[5]);
-  btf_16_adds_subs_avx2(&x[3], &x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[16], &x[23]);
-  btf_16_adds_subs_avx2(&x[17], &x[22]);
-  btf_16_adds_subs_avx2(&x[18], &x[21]);
-  btf_16_adds_subs_avx2(&x[19], &x[20]);
-  btf_16_adds_subs_avx2(&x[31], &x[24]);
-  btf_16_adds_subs_avx2(&x[30], &x[25]);
-  btf_16_adds_subs_avx2(&x[29], &x[26]);
-  btf_16_adds_subs_avx2(&x[28], &x[27]);
-}
-
-static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i _r, int8_t cos_bit) {
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(&x[0], &x[15]);
-  btf_16_adds_subs_avx2(&x[1], &x[14]);
-  btf_16_adds_subs_avx2(&x[2], &x[13]);
-  btf_16_adds_subs_avx2(&x[3], &x[12]);
-  btf_16_adds_subs_avx2(&x[4], &x[11]);
-  btf_16_adds_subs_avx2(&x[5], &x[10]);
-  btf_16_adds_subs_avx2(&x[6], &x[9]);
-  btf_16_adds_subs_avx2(&x[7], &x[8]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
-}
-
-static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
-  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
-  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
-  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
-  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
-  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
-  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
-  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
-  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
-  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
-  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
-  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
-  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
-  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
-  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
-  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
-  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
-}
-
-static void idct32_low1_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m256i x[2];
-  x[0] = input[0];
-
-  // stage 2
-  // stage 3
-  // stage 4
-  // stage 5
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-
-  // stage 6
-  // stage 7
-  // stage 8
-  // stage 9
-  output[0] = x[0];
-  output[31] = x[0];
-  output[1] = x[1];
-  output[30] = x[1];
-  output[2] = x[1];
-  output[29] = x[1];
-  output[3] = x[0];
-  output[28] = x[0];
-  output[4] = x[0];
-  output[27] = x[0];
-  output[5] = x[1];
-  output[26] = x[1];
-  output[6] = x[1];
-  output[25] = x[1];
-  output[7] = x[0];
-  output[24] = x[0];
-  output[8] = x[0];
-  output[23] = x[0];
-  output[9] = x[1];
-  output[22] = x[1];
-  output[10] = x[1];
-  output[21] = x[1];
-  output[11] = x[0];
-  output[20] = x[0];
-  output[12] = x[0];
-  output[19] = x[0];
-  output[13] = x[1];
-  output[18] = x[1];
-  output[14] = x[1];
-  output[17] = x[1];
-  output[15] = x[0];
-  output[16] = x[0];
-}
-
-static void idct32_low8_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  // stage 1
-  __m256i x[32];
-  x[0] = input[0];
-  x[4] = input[4];
-  x[8] = input[2];
-  x[12] = input[6];
-  x[16] = input[1];
-  x[20] = input[5];
-  x[24] = input[3];
-  x[28] = input[7];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
-
-  // stage 3
-  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  x[17] = x[16];
-  x[18] = x[19];
-  x[21] = x[20];
-  x[22] = x[23];
-  x[25] = x[24];
-  x[26] = x[27];
-  x[29] = x[28];
-  x[30] = x[31];
-
-  // stage 4
-  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
-  x[9] = x[8];
-  x[10] = x[11];
-  x[13] = x[12];
-  x[14] = x[15];
-  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
-
-  // stage 5
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  x[5] = x[4];
-  x[6] = x[7];
-  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
-  // stage 6
-  x[3] = x[0];
-  x[2] = x[1];
-  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
-
-  idct32_stage7_avx2(x, cospi, _r, cos_bit);
-  idct32_stage8_avx2(x, cospi, _r, cos_bit);
-  idct32_stage9_avx2(output, x);
-}
-
-static void idct32_low16_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  // stage 1
-  __m256i x[32];
-  x[0] = input[0];
-  x[2] = input[8];
-  x[4] = input[4];
-  x[6] = input[12];
-  x[8] = input[2];
-  x[10] = input[10];
-  x[12] = input[6];
-  x[14] = input[14];
-  x[16] = input[1];
-  x[18] = input[9];
-  x[20] = input[5];
-  x[22] = input[13];
-  x[24] = input[3];
-  x[26] = input[11];
-  x[28] = input[7];
-  x[30] = input[15];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
-  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
-  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
-  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
-  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
-
-  // stage 3
-  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
-  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
-  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  idct32_high16_stage3_avx2(x);
-
-  // stage 4
-  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
-  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(&x[8], &x[9]);
-  btf_16_adds_subs_avx2(&x[11], &x[10]);
-  btf_16_adds_subs_avx2(&x[12], &x[13]);
-  btf_16_adds_subs_avx2(&x[15], &x[14]);
-  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
-
-  // stage 5
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_avx2(&x[4], &x[5]);
-  btf_16_adds_subs_avx2(&x[7], &x[6]);
-  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
-
-  // stage 6
-  btf_16_adds_subs_avx2(&x[0], &x[3]);
-  btf_16_adds_subs_avx2(&x[1], &x[2]);
-  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
-
-  idct32_stage7_avx2(x, cospi, _r, cos_bit);
-  idct32_stage8_avx2(x, cospi, _r, cos_bit);
-  idct32_stage9_avx2(output, x);
-}
-
-static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
-  (void)(cos_bit);
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
-  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
-  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
-  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
-  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
-  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
-  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
-  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
-  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
-  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
-  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
-  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
-  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
-  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
-  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
-  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
-  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
-  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
-  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
-  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
-  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
-  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
-  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
-  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
-  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
-  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
-  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
-  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
-  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
-  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
-  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
-
-  // stage 1
-  __m256i x1[32];
-  x1[0] = input[0];
-  x1[1] = input[16];
-  x1[2] = input[8];
-  x1[3] = input[24];
-  x1[4] = input[4];
-  x1[5] = input[20];
-  x1[6] = input[12];
-  x1[7] = input[28];
-  x1[8] = input[2];
-  x1[9] = input[18];
-  x1[10] = input[10];
-  x1[11] = input[26];
-  x1[12] = input[6];
-  x1[13] = input[22];
-  x1[14] = input[14];
-  x1[15] = input[30];
-  x1[16] = input[1];
-  x1[17] = input[17];
-  x1[18] = input[9];
-  x1[19] = input[25];
-  x1[20] = input[5];
-  x1[21] = input[21];
-  x1[22] = input[13];
-  x1[23] = input[29];
-  x1[24] = input[3];
-  x1[25] = input[19];
-  x1[26] = input[11];
-  x1[27] = input[27];
-  x1[28] = input[7];
-  x1[29] = input[23];
-  x1[30] = input[15];
-  x1[31] = input[31];
-
-  // stage 2
-  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);
-
-  // stage 3
-  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
-  idct32_high16_stage3_avx2(x1);
-
-  // stage 4
-  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
-  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
-  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
-  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
-  idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);
-
-  // stage 5
-  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
-  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
-  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
-  idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);
-
-  // stage 6
-  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
-  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
-  idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);
-
-  idct32_stage7_avx2(x1, cospi, _r, cos_bit);
-  idct32_stage8_avx2(x1, cospi, _r, cos_bit);
-  idct32_stage9_avx2(output, x1);
-}
-
-static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
-  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
-  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
-  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
-  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
-  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
-  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
-  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
-  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
-  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
-  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
-  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
-  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
-}
-
-static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
-  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
-  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
-  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
-  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
-  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[32], &x[35]);
-  btf_16_adds_subs_avx2(&x[33], &x[34]);
-  btf_16_adds_subs_avx2(&x[39], &x[36]);
-  btf_16_adds_subs_avx2(&x[38], &x[37]);
-  btf_16_adds_subs_avx2(&x[40], &x[43]);
-  btf_16_adds_subs_avx2(&x[41], &x[42]);
-  btf_16_adds_subs_avx2(&x[47], &x[44]);
-  btf_16_adds_subs_avx2(&x[46], &x[45]);
-  btf_16_adds_subs_avx2(&x[48], &x[51]);
-  btf_16_adds_subs_avx2(&x[49], &x[50]);
-  btf_16_adds_subs_avx2(&x[55], &x[52]);
-  btf_16_adds_subs_avx2(&x[54], &x[53]);
-  btf_16_adds_subs_avx2(&x[56], &x[59]);
-  btf_16_adds_subs_avx2(&x[57], &x[58]);
-  btf_16_adds_subs_avx2(&x[63], &x[60]);
-  btf_16_adds_subs_avx2(&x[62], &x[61]);
-}
-
-static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
-  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
-  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
-  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
-  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
-  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
-}
-
-static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  btf_16_adds_subs_avx2(&x[16], &x[19]);
-  btf_16_adds_subs_avx2(&x[17], &x[18]);
-  btf_16_adds_subs_avx2(&x[23], &x[20]);
-  btf_16_adds_subs_avx2(&x[22], &x[21]);
-  btf_16_adds_subs_avx2(&x[24], &x[27]);
-  btf_16_adds_subs_avx2(&x[25], &x[26]);
-  btf_16_adds_subs_avx2(&x[31], &x[28]);
-  btf_16_adds_subs_avx2(&x[30], &x[29]);
-  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
-}
-
-static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[32], &x[39]);
-  btf_16_adds_subs_avx2(&x[33], &x[38]);
-  btf_16_adds_subs_avx2(&x[34], &x[37]);
-  btf_16_adds_subs_avx2(&x[35], &x[36]);
-  btf_16_adds_subs_avx2(&x[47], &x[40]);
-  btf_16_adds_subs_avx2(&x[46], &x[41]);
-  btf_16_adds_subs_avx2(&x[45], &x[42]);
-  btf_16_adds_subs_avx2(&x[44], &x[43]);
-  btf_16_adds_subs_avx2(&x[48], &x[55]);
-  btf_16_adds_subs_avx2(&x[49], &x[54]);
-  btf_16_adds_subs_avx2(&x[50], &x[53]);
-  btf_16_adds_subs_avx2(&x[51], &x[52]);
-  btf_16_adds_subs_avx2(&x[63], &x[56]);
-  btf_16_adds_subs_avx2(&x[62], &x[57]);
-  btf_16_adds_subs_avx2(&x[61], &x[58]);
-  btf_16_adds_subs_avx2(&x[60], &x[59]);
-}
-
-static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
-                                             const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  btf_16_adds_subs_avx2(&x[16], &x[23]);
-  btf_16_adds_subs_avx2(&x[17], &x[22]);
-  btf_16_adds_subs_avx2(&x[18], &x[21]);
-  btf_16_adds_subs_avx2(&x[19], &x[20]);
-  btf_16_adds_subs_avx2(&x[31], &x[24]);
-  btf_16_adds_subs_avx2(&x[30], &x[25]);
-  btf_16_adds_subs_avx2(&x[29], &x[26]);
-  btf_16_adds_subs_avx2(&x[28], &x[27]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
-}
-
-static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
-                                      const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(&x[0], &x[15]);
-  btf_16_adds_subs_avx2(&x[1], &x[14]);
-  btf_16_adds_subs_avx2(&x[2], &x[13]);
-  btf_16_adds_subs_avx2(&x[3], &x[12]);
-  btf_16_adds_subs_avx2(&x[4], &x[11]);
-  btf_16_adds_subs_avx2(&x[5], &x[10]);
-  btf_16_adds_subs_avx2(&x[6], &x[9]);
-  btf_16_adds_subs_avx2(&x[7], &x[8]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[32], &x[47]);
-  btf_16_adds_subs_avx2(&x[33], &x[46]);
-  btf_16_adds_subs_avx2(&x[34], &x[45]);
-  btf_16_adds_subs_avx2(&x[35], &x[44]);
-  btf_16_adds_subs_avx2(&x[36], &x[43]);
-  btf_16_adds_subs_avx2(&x[37], &x[42]);
-  btf_16_adds_subs_avx2(&x[38], &x[41]);
-  btf_16_adds_subs_avx2(&x[39], &x[40]);
-  btf_16_adds_subs_avx2(&x[63], &x[48]);
-  btf_16_adds_subs_avx2(&x[62], &x[49]);
-  btf_16_adds_subs_avx2(&x[61], &x[50]);
-  btf_16_adds_subs_avx2(&x[60], &x[51]);
-  btf_16_adds_subs_avx2(&x[59], &x[52]);
-  btf_16_adds_subs_avx2(&x[58], &x[53]);
-  btf_16_adds_subs_avx2(&x[57], &x[54]);
-  btf_16_adds_subs_avx2(&x[56], &x[55]);
-}
-
-static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
-                                       const __m256i _r, int8_t cos_bit) {
-  (void)cos_bit;
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_avx2(&x[0], &x[31]);
-  btf_16_adds_subs_avx2(&x[1], &x[30]);
-  btf_16_adds_subs_avx2(&x[2], &x[29]);
-  btf_16_adds_subs_avx2(&x[3], &x[28]);
-  btf_16_adds_subs_avx2(&x[4], &x[27]);
-  btf_16_adds_subs_avx2(&x[5], &x[26]);
-  btf_16_adds_subs_avx2(&x[6], &x[25]);
-  btf_16_adds_subs_avx2(&x[7], &x[24]);
-  btf_16_adds_subs_avx2(&x[8], &x[23]);
-  btf_16_adds_subs_avx2(&x[9], &x[22]);
-  btf_16_adds_subs_avx2(&x[10], &x[21]);
-  btf_16_adds_subs_avx2(&x[11], &x[20]);
-  btf_16_adds_subs_avx2(&x[12], &x[19]);
-  btf_16_adds_subs_avx2(&x[13], &x[18]);
-  btf_16_adds_subs_avx2(&x[14], &x[17]);
-  btf_16_adds_subs_avx2(&x[15], &x[16]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
-}
-
-static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
-  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
-  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
-  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
-  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
-  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
-  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
-  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
-  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
-  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
-  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
-  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
-  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
-  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
-  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
-  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
-  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
-  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
-  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
-  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
-  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
-  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
-  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
-  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
-  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
-  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
-  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
-  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
-  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
-  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
-  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
-  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
-  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
-}
-
-static void idct64_low1_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m256i x[32];
-  x[0] = input[0];
-
-  // stage 2
-  // stage 3
-  // stage 4
-  // stage 5
-  // stage 6
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-
-  // stage 7
-  // stage 8
-  // stage 9
-  // stage 10
-  // stage 11
-  output[0] = x[0];
-  output[63] = x[0];
-  output[1] = x[1];
-  output[62] = x[1];
-  output[2] = x[1];
-  output[61] = x[1];
-  output[3] = x[0];
-  output[60] = x[0];
-  output[4] = x[0];
-  output[59] = x[0];
-  output[5] = x[1];
-  output[58] = x[1];
-  output[6] = x[1];
-  output[57] = x[1];
-  output[7] = x[0];
-  output[56] = x[0];
-  output[8] = x[0];
-  output[55] = x[0];
-  output[9] = x[1];
-  output[54] = x[1];
-  output[10] = x[1];
-  output[53] = x[1];
-  output[11] = x[0];
-  output[52] = x[0];
-  output[12] = x[0];
-  output[51] = x[0];
-  output[13] = x[1];
-  output[50] = x[1];
-  output[14] = x[1];
-  output[49] = x[1];
-  output[15] = x[0];
-  output[48] = x[0];
-  output[16] = x[0];
-  output[47] = x[0];
-  output[17] = x[1];
-  output[46] = x[1];
-  output[18] = x[1];
-  output[45] = x[1];
-  output[19] = x[0];
-  output[44] = x[0];
-  output[20] = x[0];
-  output[43] = x[0];
-  output[21] = x[1];
-  output[42] = x[1];
-  output[22] = x[1];
-  output[41] = x[1];
-  output[23] = x[0];
-  output[40] = x[0];
-  output[24] = x[0];
-  output[39] = x[0];
-  output[25] = x[1];
-  output[38] = x[1];
-  output[26] = x[1];
-  output[37] = x[1];
-  output[27] = x[0];
-  output[36] = x[0];
-  output[28] = x[0];
-  output[35] = x[0];
-  output[29] = x[1];
-  output[34] = x[1];
-  output[30] = x[1];
-  output[33] = x[1];
-  output[31] = x[0];
-  output[32] = x[0];
-}
-
-static void idct64_low8_avx2(const __m256i *input, __m256i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
-  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
-  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
-  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
-  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
-  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
-  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
-  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
-  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
-  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
-  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
-  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m256i x[64];
-  x[0] = input[0];
-  x[8] = input[4];
-  x[16] = input[2];
-  x[24] = input[6];
-  x[32] = input[1];
-  x[40] = input[5];
-  x[48] = input[3];
-  x[56] = input[7];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
-  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
-  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
-  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
-
-  // stage 3
-  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  x[33] = x[32];
-  x[38] = x[39];
-  x[41] = x[40];
-  x[46] = x[47];
-  x[49] = x[48];
-  x[54] = x[55];
-  x[57] = x[56];
-  x[62] = x[63];
-
-  // stage 4
-  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
-  x[17] = x[16];
-  x[22] = x[23];
-  x[25] = x[24];
-  x[30] = x[31];
-  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
-
-  // stage 5
-  x[9] = x[8];
-  x[14] = x[15];
-  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
-  x[35] = x[32];
-  x[34] = x[33];
-  x[36] = x[39];
-  x[37] = x[38];
-  x[43] = x[40];
-  x[42] = x[41];
-  x[44] = x[47];
-  x[45] = x[46];
-  x[51] = x[48];
-  x[50] = x[49];
-  x[52] = x[55];
-  x[53] = x[54];
-  x[59] = x[56];
-  x[58] = x[57];
-  x[60] = x[63];
-  x[61] = x[62];
-
-  // stage 6
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
-  x[19] = x[16];
-  x[18] = x[17];
-  x[20] = x[23];
-  x[21] = x[22];
-  x[27] = x[24];
-  x[26] = x[25];
-  x[28] = x[31];
-  x[29] = x[30];
-  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
-
-  // stage 7
-  x[3] = x[0];
-  x[2] = x[1];
-  x[11] = x[8];
-  x[10] = x[9];
-  x[12] = x[15];
-  x[13] = x[14];
-  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 8
-  x[7] = x[0];
-  x[6] = x[1];
-  x[5] = x[2];
-  x[4] = x[3];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
-
-  idct64_stage9_avx2(x, cospi, _r, cos_bit);
-  idct64_stage10_avx2(x, cospi, _r, cos_bit);
-  idct64_stage11_avx2(output, x);
-}
-
-static void idct64_low16_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m256i x[64];
-  x[0] = input[0];
-  x[4] = input[8];
-  x[8] = input[4];
-  x[12] = input[12];
-  x[16] = input[2];
-  x[20] = input[10];
-  x[24] = input[6];
-  x[28] = input[14];
-  x[32] = input[1];
-  x[36] = input[9];
-  x[40] = input[5];
-  x[44] = input[13];
-  x[48] = input[3];
-  x[52] = input[11];
-  x[56] = input[7];
-  x[60] = input[15];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
-  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
-  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
-  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
-  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
-  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
-  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
-  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
-
-  // stage 3
-  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  x[33] = x[32];
-  x[34] = x[35];
-  x[37] = x[36];
-  x[38] = x[39];
-  x[41] = x[40];
-  x[42] = x[43];
-  x[45] = x[44];
-  x[46] = x[47];
-  x[49] = x[48];
-  x[50] = x[51];
-  x[53] = x[52];
-  x[54] = x[55];
-  x[57] = x[56];
-  x[58] = x[59];
-  x[61] = x[60];
-  x[62] = x[63];
-
-  // stage 4
-  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  x[17] = x[16];
-  x[18] = x[19];
-  x[21] = x[20];
-  x[22] = x[23];
-  x[25] = x[24];
-  x[26] = x[27];
-  x[29] = x[28];
-  x[30] = x[31];
-  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
-
-  // stage 5
-  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
-  x[9] = x[8];
-  x[10] = x[11];
-  x[13] = x[12];
-  x[14] = x[15];
-  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 6
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  x[5] = x[4];
-  x[6] = x[7];
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
-  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 7
-  x[3] = x[0];
-  x[2] = x[1];
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[8], &x[11]);
-  btf_16_adds_subs_avx2(&x[9], &x[10]);
-  btf_16_adds_subs_avx2(&x[15], &x[12]);
-  btf_16_adds_subs_avx2(&x[14], &x[13]);
-  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 8
-  btf_16_adds_subs_avx2(&x[0], &x[7]);
-  btf_16_adds_subs_avx2(&x[1], &x[6]);
-  btf_16_adds_subs_avx2(&x[2], &x[5]);
-  btf_16_adds_subs_avx2(&x[3], &x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
-
-  idct64_stage9_avx2(x, cospi, _r, cos_bit);
-  idct64_stage10_avx2(x, cospi, _r, cos_bit);
-  idct64_stage11_avx2(output, x);
-}
-
-static void idct64_low32_avx2(const __m256i *input, __m256i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
-  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
-  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
-  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
-  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m256i x[64];
-  x[0] = input[0];
-  x[2] = input[16];
-  x[4] = input[8];
-  x[6] = input[24];
-  x[8] = input[4];
-  x[10] = input[20];
-  x[12] = input[12];
-  x[14] = input[28];
-  x[16] = input[2];
-  x[18] = input[18];
-  x[20] = input[10];
-  x[22] = input[26];
-  x[24] = input[6];
-  x[26] = input[22];
-  x[28] = input[14];
-  x[30] = input[30];
-  x[32] = input[1];
-  x[34] = input[17];
-  x[36] = input[9];
-  x[38] = input[25];
-  x[40] = input[5];
-  x[42] = input[21];
-  x[44] = input[13];
-  x[46] = input[29];
-  x[48] = input[3];
-  x[50] = input[19];
-  x[52] = input[11];
-  x[54] = input[27];
-  x[56] = input[7];
-  x[58] = input[23];
-  x[60] = input[15];
-  x[62] = input[31];
-
-  // stage 2
-  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
-  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
-  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
-  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
-  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
-  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
-  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
-  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
-  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
-  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
-  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
-  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
-  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
-  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
-  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
-  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
-
-  // stage 3
-  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
-  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
-  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
-  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
-  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  btf_16_adds_subs_avx2(&x[32], &x[33]);
-  btf_16_adds_subs_avx2(&x[35], &x[34]);
-  btf_16_adds_subs_avx2(&x[36], &x[37]);
-  btf_16_adds_subs_avx2(&x[39], &x[38]);
-  btf_16_adds_subs_avx2(&x[40], &x[41]);
-  btf_16_adds_subs_avx2(&x[43], &x[42]);
-  btf_16_adds_subs_avx2(&x[44], &x[45]);
-  btf_16_adds_subs_avx2(&x[47], &x[46]);
-  btf_16_adds_subs_avx2(&x[48], &x[49]);
-  btf_16_adds_subs_avx2(&x[51], &x[50]);
-  btf_16_adds_subs_avx2(&x[52], &x[53]);
-  btf_16_adds_subs_avx2(&x[55], &x[54]);
-  btf_16_adds_subs_avx2(&x[56], &x[57]);
-  btf_16_adds_subs_avx2(&x[59], &x[58]);
-  btf_16_adds_subs_avx2(&x[60], &x[61]);
-  btf_16_adds_subs_avx2(&x[63], &x[62]);
-
-  // stage 4
-  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
-  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
-  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  btf_16_adds_subs_avx2(&x[16], &x[17]);
-  btf_16_adds_subs_avx2(&x[19], &x[18]);
-  btf_16_adds_subs_avx2(&x[20], &x[21]);
-  btf_16_adds_subs_avx2(&x[23], &x[22]);
-  btf_16_adds_subs_avx2(&x[24], &x[25]);
-  btf_16_adds_subs_avx2(&x[27], &x[26]);
-  btf_16_adds_subs_avx2(&x[28], &x[29]);
-  btf_16_adds_subs_avx2(&x[31], &x[30]);
-  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
-
-  // stage 5
-  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
-  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_avx2(&x[8], &x[9]);
-  btf_16_adds_subs_avx2(&x[11], &x[10]);
-  btf_16_adds_subs_avx2(&x[12], &x[13]);
-  btf_16_adds_subs_avx2(&x[15], &x[14]);
-  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 6
-  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_avx2(&x[4], &x[5]);
-  btf_16_adds_subs_avx2(&x[7], &x[6]);
-  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
-  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 7
-  btf_16_adds_subs_avx2(&x[0], &x[3]);
-  btf_16_adds_subs_avx2(&x[1], &x[2]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
-  btf_16_adds_subs_avx2(&x[8], &x[11]);
-  btf_16_adds_subs_avx2(&x[9], &x[10]);
-  btf_16_adds_subs_avx2(&x[15], &x[12]);
-  btf_16_adds_subs_avx2(&x[14], &x[13]);
-  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 8
-  btf_16_adds_subs_avx2(&x[0], &x[7]);
-  btf_16_adds_subs_avx2(&x[1], &x[6]);
-  btf_16_adds_subs_avx2(&x[2], &x[5]);
-  btf_16_adds_subs_avx2(&x[3], &x[4]);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
-  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
-  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
-
-  // stage 9~11
-  idct64_stage9_avx2(x, cospi, _r, cos_bit);
-  idct64_stage10_avx2(x, cospi, _r, cos_bit);
-  idct64_stage11_avx2(output, x);
-}
-
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit);
-
-// 1D functions process 16 pixels at one time.
-static const transform_1d_avx2
-    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
-      {
-          { NULL, NULL, NULL, NULL },
-          { NULL, NULL, NULL, NULL },
-          { NULL, NULL, NULL, NULL },
-      },
-      { { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
-      {
-          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
-          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
-          { NULL, NULL, NULL, NULL },
-      },
-      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
-      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
-          idct64_low32_avx2 },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } }
-    };
-
-// only process w >= 16 h >= 16
-static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  __m256i buf1[64 * 16];
-  int eobx, eoby;
-  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_size_w_div16 = txfm_size_col >> 4;
-  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
-  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
-  const transform_1d_avx2 row_txfm =
-      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-  const transform_1d_avx2 col_txfm =
-      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
-  assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
-  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
-    __m256i buf0[64];
-    const int32_t *input_row = input + (i << 4) * input_stride;
-    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
-      __m256i *buf0_cur = buf0 + j * 16;
-      const int32_t *input_cur = input_row + j * 16;
-      load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
-                                          16);
-      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
-    }
-    if (rect_type == 1 || rect_type == -1) {
-      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
-    }
-    row_txfm(buf0, buf0, cos_bit_row);
-    for (int j = 0; j < txfm_size_col; ++j) {
-      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
-    }
-
-    __m256i *buf1_cur = buf1 + (i << 4);
-    if (lr_flip) {
-      for (int j = 0; j < buf_size_w_div16; ++j) {
-        __m256i temp[16];
-        flip_buf_avx2(buf0 + 16 * j, temp, 16);
-        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
-        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
-      }
-    } else {
-      for (int j = 0; j < buf_size_w_div16; ++j) {
-        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
-      }
-    }
-  }
-  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
-  for (int i = 0; i < buf_size_w_div16; i++) {
-    __m256i *buf1_cur = buf1 + i * txfm_size_row;
-    col_txfm(buf1_cur, buf1_cur, cos_bit_col);
-    for (int j = 0; j < txfm_size_row; ++j) {
-      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
-    }
-  }
-  for (int i = 0; i < buf_size_w_div16; i++) {
-    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
-                                 stride, ud_flip, txfm_size_row);
-  }
-}
-
-static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
-                                           int stride, int shift, int height,
-                                           int txw_idx, int rect_type) {
-  const int32_t *input_row = input;
-  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
-  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
-                                       (1 << (NewSqrt2Bits - shift - 1)));
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
-  if (rect_type != 1 && rect_type != -1) {
-    for (int i = 0; i < height; ++i) {
-      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
-      input_row += stride;
-      __m256i lo = _mm256_unpacklo_epi16(src, one);
-      __m256i hi = _mm256_unpackhi_epi16(src, one);
-      lo = _mm256_madd_epi16(lo, scale__r);
-      hi = _mm256_madd_epi16(hi, scale__r);
-      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
-      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
-      out[i] = _mm256_packs_epi32(lo, hi);
-    }
-  } else {
-    const __m256i rect_scale =
-        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
-    for (int i = 0; i < height; ++i) {
-      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
-      src = _mm256_mulhrs_epi16(src, rect_scale);
-      input_row += stride;
-      __m256i lo = _mm256_unpacklo_epi16(src, one);
-      __m256i hi = _mm256_unpackhi_epi16(src, one);
-      lo = _mm256_madd_epi16(lo, scale__r);
-      hi = _mm256_madd_epi16(hi, scale__r);
-      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
-      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
-      out[i] = _mm256_packs_epi32(lo, hi);
-    }
-  }
-}
-
-static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
-                                           __m256i *buf, int shift, int height,
-                                           int txh_idx) {
-  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
-  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
-  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
-  for (int h = 0; h < height; ++h) {
-    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
-    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
-    lo = _mm256_madd_epi16(lo, scale_coeff);
-    hi = _mm256_madd_epi16(hi, scale_coeff);
-    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
-    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
-    lo = _mm256_add_epi32(lo, shift__r);
-    hi = _mm256_add_epi32(hi, shift__r);
-    lo = _mm256_srai_epi32(lo, -shift);
-    hi = _mm256_srai_epi32(hi, -shift);
-    const __m256i x = _mm256_packs_epi32(lo, hi);
-    write_recon_w16_avx2(x, output);
-    output += stride;
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
-                                                  uint8_t *output, int stride,
-                                                  TX_SIZE tx_size,
-                                                  int32_t eob) {
-  (void)eob;
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int row_max = AOMMIN(32, txfm_size_row);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  __m256i buf[32];
-  for (int i = 0; i < input_stride; i += 16) {
-    iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
-                            txw_idx, rect_type);
-    iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
-                            txh_idx);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  int eobx, eoby;
-  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
-  const int input_stride = txfm_size_col_notzero;
-  const int buf_size_w_div16 = (eobx + 16) >> 4;
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
-  const transform_1d_avx2 col_txfm =
-      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
-  assert(col_txfm != NULL);
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_w_div16; i++) {
-    __m256i buf0[64];
-    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
-                            eoby + 1, txw_idx, rect_type);
-    col_txfm(buf0, buf0, cos_bit_col);
-    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
-    int k = ud_flip ? (txfm_size_row - 1) : 0;
-    const int step = ud_flip ? -1 : 1;
-    for (int j = 0; j < txfm_size_row; ++j, k += step) {
-      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
-      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
-    }
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  __m256i buf1[64];
-  int eobx, eoby;
-  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_size_w_div16 = txfm_size_col >> 4;
-  const int buf_size_h_div16 = (eoby + 16) >> 4;
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const transform_1d_avx2 row_txfm =
-      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-
-  assert(row_txfm != NULL);
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_h_div16; i++) {
-    __m256i buf0[64];
-    const int32_t *input_row = input + i * input_stride * 16;
-    for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
-      __m256i *buf0_cur = buf0 + j * 16;
-      load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
-                                          buf0_cur, 16);
-      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
-    }
-    if (rect_type == 1 || rect_type == -1) {
-      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
-    }
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
-    __m256i *_buf1 = buf1;
-    if (lr_flip) {
-      for (int j = 0; j < buf_size_w_div16; ++j) {
-        __m256i temp[16];
-        flip_buf_avx2(buf0 + 16 * j, temp, 16);
-        transpose_16bit_16x16_avx2(temp,
-                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
-      }
-    } else {
-      for (int j = 0; j < buf_size_w_div16; ++j) {
-        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
-      }
-    }
-    for (int j = 0; j < buf_size_w_div16; ++j) {
-      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
-                              buf1 + j * 16, shift[1], 16, txh_idx);
-    }
-  }
-}
-
-// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
-static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  (void)eob;
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:   // ADST in vertical, DCT in horizontal
-    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
-    case ADST_ADST:  // ADST in both directions
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
-                                            tx_size, eob);
-      break;
-    case IDTX:
-      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
-      break;
-    case V_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
-                                           tx_size, eob);
-      break;
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
-                                           tx_size, eob);
-      break;
-    default:
-      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
-                                     eob);
-      break;
-  }
-}
-
-void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
-                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
-                                   int eob) {
-  switch (tx_size) {
-    case TX_4X4:
-    case TX_8X8:
-    case TX_4X8:
-    case TX_8X4:
-    case TX_8X16:
-    case TX_16X8:
-    case TX_4X16:
-    case TX_16X4:
-    case TX_8X32:
-    case TX_32X8:
-      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
-                                     eob);
-      break;
-    case TX_16X16:
-    case TX_32X32:
-    case TX_64X64:
-    case TX_16X32:
-    case TX_32X16:
-    case TX_32X64:
-    case TX_64X32:
-    case TX_16X64:
-    case TX_64X16:
-    default:
-      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
-                                         tx_size, eob);
-      break;
-  }
-}
-
-void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
-                           const TxfmParam *txfm_param) {
-  const TX_TYPE tx_type = txfm_param->tx_type;
-  if (!txfm_param->lossless) {
-    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
-                                  txfm_param->tx_size, txfm_param->eob);
-  } else {
-    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
-  }
-}
diff --git a/av1/common/x86/av1_inv_txfm_avx2.h b/av1/common/x86/av1_inv_txfm_avx2.h
deleted file mode 100644
index 88bd196..0000000
--- a/av1/common/x86/av1_inv_txfm_avx2.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
-#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/transpose_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// half input is zero
-#define btf_16_w16_0_avx2(w0, w1, in, out0, out1)  \
-  {                                                \
-    const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
-    const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
-    const __m256i _in = in;                        \
-    out0 = _mm256_mulhrs_epi16(_in, _w0);          \
-    out1 = _mm256_mulhrs_epi16(_in, _w1);          \
-  }
-
-static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
-                                    int size) {
-  const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
-  for (int i = 0; i < size; ++i) {
-    output[i] = _mm256_mulhrs_epi16(input[i], scale);
-  }
-}
-
-static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
-  __m128i pred = _mm_loadu_si128((__m128i const *)(output));
-  __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
-  __m128i y = _mm256_castsi256_si128(
-      _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
-  _mm_storeu_si128((__m128i *)(output), y);
-}
-
-static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
-                                                int stride, int flipud,
-                                                int height) {
-  int j = flipud ? (height - 1) : 0;
-  const int step = flipud ? -1 : 1;
-  for (int i = 0; i < height; ++i, j += step) {
-    write_recon_w16_avx2(in[j], output + i * stride);
-  }
-}
-
-void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
-                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
-                                   int eob);
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c
deleted file mode 100644
index 642e7a2..0000000
--- a/av1/common/x86/av1_inv_txfm_ssse3.c
+++ /dev/null
@@ -1,2957 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
-#include "av1/common/av1_inv_txfm1d_cfg.h"
-#include "av1/common/x86/av1_inv_txfm_ssse3.h"
-#include "av1/common/x86/av1_txfm_sse2.h"
-
-// TODO(venkatsanampudi@ittiam.com): move this to header file
-
-// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
-static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
-                                          4 * 5793 };
-
-// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
-
-static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-
-  // stage 1
-  __m128i x[4];
-  x[0] = input[0];
-  x[1] = input[2];
-  x[2] = input[1];
-  x[3] = input[3];
-
-  // stage 2
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-
-  // stage 3
-  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
-  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
-}
-
-static void idct4_w4_sse2(const __m128i *input, __m128i *output,
-                          int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-
-  // stage 1
-  __m128i x[4];
-  x[0] = input[0];
-  x[1] = input[2];
-  x[2] = input[1];
-  x[3] = input[3];
-
-  // stage 2
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-
-  // stage 3
-  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
-  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
-}
-
-static void idct8_low1_ssse3(const __m128i *input, __m128i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m128i x[2];
-  x[0] = input[0];
-
-  // stage 2
-  // stage 3
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-
-  // stage 4
-  // stage 5
-  output[0] = x[0];
-  output[7] = x[0];
-  output[1] = x[1];
-  output[6] = x[1];
-  output[2] = x[1];
-  output[5] = x[1];
-  output[3] = x[0];
-  output[4] = x[0];
-}
-
-static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m128i x[8];
-  x[0] = input[0];
-  x[1] = input[4];
-  x[2] = input[2];
-  x[3] = input[6];
-  x[4] = input[1];
-  x[5] = input[5];
-  x[6] = input[3];
-  x[7] = input[7];
-
-  // stage 2
-  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
-  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
-
-  // stage 3
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-
-  // stage 4
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-
-  // stage 5
-  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
-  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
-  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
-  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
-}
-
-static void idct8_w4_sse2(const __m128i *input, __m128i *output,
-                          int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m128i x[8];
-  x[0] = input[0];
-  x[1] = input[4];
-  x[2] = input[2];
-  x[3] = input[6];
-  x[4] = input[1];
-  x[5] = input[5];
-  x[6] = input[3];
-  x[7] = input[7];
-
-  // stage 2
-  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
-  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
-
-  // stage 3
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-
-  // stage 4
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-
-  // stage 5
-  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
-  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
-  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
-  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
-}
-
-static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
-                                      const __m128i __rounding,
-                                      int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[11]);
-  btf_16_adds_subs_sse2(x[9], x[10]);
-  btf_16_subs_adds_sse2(x[15], x[12]);
-  btf_16_subs_adds_sse2(x[14], x[13]);
-}
-
-static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
-                                      const __m128i __rounding,
-                                      int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_sse2(x[0], x[7]);
-  btf_16_adds_subs_sse2(x[1], x[6]);
-  btf_16_adds_subs_sse2(x[2], x[5]);
-  btf_16_adds_subs_sse2(x[3], x[4]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-}
-
-static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
-  btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
-  btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
-  btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
-  btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
-  btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
-  btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
-  btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
-  btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
-}
-
-static void idct16_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m128i x[2];
-  x[0] = input[0];
-
-  // stage 2
-  // stage 3
-  // stage 4
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-
-  // stage 5
-  // stage 6
-  // stage 7
-  output[0] = x[0];
-  output[15] = x[0];
-  output[1] = x[1];
-  output[14] = x[1];
-  output[2] = x[1];
-  output[13] = x[1];
-  output[3] = x[0];
-  output[12] = x[0];
-  output[4] = x[0];
-  output[11] = x[0];
-  output[5] = x[1];
-  output[10] = x[1];
-  output[6] = x[1];
-  output[9] = x[1];
-  output[7] = x[0];
-  output[8] = x[0];
-}
-
-static void idct16_low8_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-
-  // stage 1
-  __m128i x[16];
-  x[0] = input[0];
-  x[2] = input[4];
-  x[4] = input[2];
-  x[6] = input[6];
-  x[8] = input[1];
-  x[10] = input[5];
-  x[12] = input[3];
-  x[14] = input[7];
-
-  // stage 2
-  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
-  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
-  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
-
-  // stage 3
-  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
-  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[9]);
-  btf_16_subs_adds_sse2(x[11], x[10]);
-  btf_16_adds_subs_sse2(x[12], x[13]);
-  btf_16_subs_adds_sse2(x[15], x[14]);
-
-  // stage 4
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-
-  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
-  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
-  idct16_stage7_sse2(output, x);
-}
-
-static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
-  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
-  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
-  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
-  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
-  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
-  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
-  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-
-  // stage 1
-  __m128i x[16];
-  x[0] = input[0];
-  x[1] = input[8];
-  x[2] = input[4];
-  x[3] = input[12];
-  x[4] = input[2];
-  x[5] = input[10];
-  x[6] = input[6];
-  x[7] = input[14];
-  x[8] = input[1];
-  x[9] = input[9];
-  x[10] = input[5];
-  x[11] = input[13];
-  x[12] = input[3];
-  x[13] = input[11];
-  x[14] = input[7];
-  x[15] = input[15];
-
-  // stage 2
-  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
-  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
-
-  // stage 3
-  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
-  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[9]);
-  btf_16_subs_adds_sse2(x[11], x[10]);
-  btf_16_adds_subs_sse2(x[12], x[13]);
-  btf_16_subs_adds_sse2(x[15], x[14]);
-
-  // stage 4
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-
-  // stage 5~7
-  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
-  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
-  idct16_stage7_sse2(output, x);
-}
-
-static void idct16_w4_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
-  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
-  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
-  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
-  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
-  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
-  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
-  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m128i x[16];
-  x[0] = input[0];
-  x[1] = input[8];
-  x[2] = input[4];
-  x[3] = input[12];
-  x[4] = input[2];
-  x[5] = input[10];
-  x[6] = input[6];
-  x[7] = input[14];
-  x[8] = input[1];
-  x[9] = input[9];
-  x[10] = input[5];
-  x[11] = input[13];
-  x[12] = input[3];
-  x[13] = input[11];
-  x[14] = input[7];
-  x[15] = input[15];
-
-  // stage 2
-  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
-  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
-  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
-  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
-
-  // stage 3
-  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
-  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[9]);
-  btf_16_subs_adds_sse2(x[11], x[10]);
-  btf_16_adds_subs_sse2(x[12], x[13]);
-  btf_16_subs_adds_sse2(x[15], x[14]);
-
-  // stage 4
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-
-  // stage 5
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[11]);
-  btf_16_adds_subs_sse2(x[9], x[10]);
-  btf_16_subs_adds_sse2(x[15], x[12]);
-  btf_16_subs_adds_sse2(x[14], x[13]);
-
-  // stage 6
-  btf_16_adds_subs_sse2(x[0], x[7]);
-  btf_16_adds_subs_sse2(x[1], x[6]);
-  btf_16_adds_subs_sse2(x[2], x[5]);
-  btf_16_adds_subs_sse2(x[3], x[4]);
-  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-
-  // stage 7
-  idct16_stage7_sse2(output, x);
-}
-
-static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
-  btf_16_adds_subs_sse2(x[16], x[17]);
-  btf_16_subs_adds_sse2(x[19], x[18]);
-  btf_16_adds_subs_sse2(x[20], x[21]);
-  btf_16_subs_adds_sse2(x[23], x[22]);
-  btf_16_adds_subs_sse2(x[24], x[25]);
-  btf_16_subs_adds_sse2(x[27], x[26]);
-  btf_16_adds_subs_sse2(x[28], x[29]);
-  btf_16_subs_adds_sse2(x[31], x[30]);
-}
-
-static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
-  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
-  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
-  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
-  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
-  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
-  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
-  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
-  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
-  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
-}
-
-static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-  btf_16_adds_subs_sse2(x[16], x[19]);
-  btf_16_adds_subs_sse2(x[17], x[18]);
-  btf_16_subs_adds_sse2(x[23], x[20]);
-  btf_16_subs_adds_sse2(x[22], x[21]);
-  btf_16_adds_subs_sse2(x[24], x[27]);
-  btf_16_adds_subs_sse2(x[25], x[26]);
-  btf_16_subs_adds_sse2(x[31], x[28]);
-  btf_16_subs_adds_sse2(x[30], x[29]);
-}
-
-static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[11]);
-  btf_16_adds_subs_sse2(x[9], x[10]);
-  btf_16_subs_adds_sse2(x[15], x[12]);
-  btf_16_subs_adds_sse2(x[14], x[13]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
-}
-
-static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
-                                      const __m128i __rounding,
-                                      int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_sse2(x[0], x[7]);
-  btf_16_adds_subs_sse2(x[1], x[6]);
-  btf_16_adds_subs_sse2(x[2], x[5]);
-  btf_16_adds_subs_sse2(x[3], x[4]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  btf_16_adds_subs_sse2(x[16], x[23]);
-  btf_16_adds_subs_sse2(x[17], x[22]);
-  btf_16_adds_subs_sse2(x[18], x[21]);
-  btf_16_adds_subs_sse2(x[19], x[20]);
-  btf_16_subs_adds_sse2(x[31], x[24]);
-  btf_16_subs_adds_sse2(x[30], x[25]);
-  btf_16_subs_adds_sse2(x[29], x[26]);
-  btf_16_subs_adds_sse2(x[28], x[27]);
-}
-
-static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
-                                      const __m128i __rounding,
-                                      int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_sse2(x[0], x[15]);
-  btf_16_adds_subs_sse2(x[1], x[14]);
-  btf_16_adds_subs_sse2(x[2], x[13]);
-  btf_16_adds_subs_sse2(x[3], x[12]);
-  btf_16_adds_subs_sse2(x[4], x[11]);
-  btf_16_adds_subs_sse2(x[5], x[10]);
-  btf_16_adds_subs_sse2(x[6], x[9]);
-  btf_16_adds_subs_sse2(x[7], x[8]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
-}
-
-static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
-  btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
-  btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
-  btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
-  btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
-  btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
-  btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
-  btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
-  btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
-  btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
-  btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
-  btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
-  btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
-  btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
-  btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
-  btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
-  btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
-}
-
-static void idct32_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m128i x[2];
-  x[0] = input[0];
-
-  // stage 2
-  // stage 3
-  // stage 4
-  // stage 5
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-
-  // stage 6
-  // stage 7
-  // stage 8
-  // stage 9
-  output[0] = x[0];
-  output[31] = x[0];
-  output[1] = x[1];
-  output[30] = x[1];
-  output[2] = x[1];
-  output[29] = x[1];
-  output[3] = x[0];
-  output[28] = x[0];
-  output[4] = x[0];
-  output[27] = x[0];
-  output[5] = x[1];
-  output[26] = x[1];
-  output[6] = x[1];
-  output[25] = x[1];
-  output[7] = x[0];
-  output[24] = x[0];
-  output[8] = x[0];
-  output[23] = x[0];
-  output[9] = x[1];
-  output[22] = x[1];
-  output[10] = x[1];
-  output[21] = x[1];
-  output[11] = x[0];
-  output[20] = x[0];
-  output[12] = x[0];
-  output[19] = x[0];
-  output[13] = x[1];
-  output[18] = x[1];
-  output[14] = x[1];
-  output[17] = x[1];
-  output[15] = x[0];
-  output[16] = x[0];
-}
-
-static void idct32_low8_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  // stage 1
-  __m128i x[32];
-  x[0] = input[0];
-  x[4] = input[4];
-  x[8] = input[2];
-  x[12] = input[6];
-  x[16] = input[1];
-  x[20] = input[5];
-  x[24] = input[3];
-  x[28] = input[7];
-
-  // stage 2
-  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
-
-  // stage 3
-  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  x[17] = x[16];
-  x[18] = x[19];
-  x[21] = x[20];
-  x[22] = x[23];
-  x[25] = x[24];
-  x[26] = x[27];
-  x[29] = x[28];
-  x[30] = x[31];
-
-  // stage 4
-  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
-  x[9] = x[8];
-  x[10] = x[11];
-  x[13] = x[12];
-  x[14] = x[15];
-  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 5
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-  x[5] = x[4];
-  x[6] = x[7];
-  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
-  // stage 6
-  x[3] = x[0];
-  x[2] = x[1];
-  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
-
-  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
-  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
-  idct32_stage9_sse2(output, x);
-}
-
-static void idct32_low16_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  // stage 1
-  __m128i x[32];
-  x[0] = input[0];
-  x[2] = input[8];
-  x[4] = input[4];
-  x[6] = input[12];
-  x[8] = input[2];
-  x[10] = input[10];
-  x[12] = input[6];
-  x[14] = input[14];
-  x[16] = input[1];
-  x[18] = input[9];
-  x[20] = input[5];
-  x[22] = input[13];
-  x[24] = input[3];
-  x[26] = input[11];
-  x[28] = input[7];
-  x[30] = input[15];
-
-  // stage 2
-  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
-  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
-  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
-  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
-  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
-
-  // stage 3
-  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
-  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
-  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  idct32_high16_stage3_sse2(x);
-
-  // stage 4
-  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
-  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[9]);
-  btf_16_subs_adds_sse2(x[11], x[10]);
-  btf_16_adds_subs_sse2(x[12], x[13]);
-  btf_16_subs_adds_sse2(x[15], x[14]);
-  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 5
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
-
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
-
-  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
-  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
-  idct32_stage9_sse2(output, x);
-}
-
-static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
-  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
-  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
-  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
-  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
-  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
-  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
-  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
-  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
-  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
-  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
-  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
-  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
-  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
-  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
-  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
-  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
-  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
-  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
-  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
-  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
-  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
-  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
-  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-
-  // stage 1
-  __m128i x[32];
-  x[0] = input[0];
-  x[1] = input[16];
-  x[2] = input[8];
-  x[3] = input[24];
-  x[4] = input[4];
-  x[5] = input[20];
-  x[6] = input[12];
-  x[7] = input[28];
-  x[8] = input[2];
-  x[9] = input[18];
-  x[10] = input[10];
-  x[11] = input[26];
-  x[12] = input[6];
-  x[13] = input[22];
-  x[14] = input[14];
-  x[15] = input[30];
-  x[16] = input[1];
-  x[17] = input[17];
-  x[18] = input[9];
-  x[19] = input[25];
-  x[20] = input[5];
-  x[21] = input[21];
-  x[22] = input[13];
-  x[23] = input[29];
-  x[24] = input[3];
-  x[25] = input[19];
-  x[26] = input[11];
-  x[27] = input[27];
-  x[28] = input[7];
-  x[29] = input[23];
-  x[30] = input[15];
-  x[31] = input[31];
-
-  // stage 2
-  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
-  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
-  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
-  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
-  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
-  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
-  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
-  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
-
-  // stage 3
-  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
-  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
-  idct32_high16_stage3_sse2(x);
-
-  // stage 4
-  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
-  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[9]);
-  btf_16_subs_adds_sse2(x[11], x[10]);
-  btf_16_adds_subs_sse2(x[12], x[13]);
-  btf_16_subs_adds_sse2(x[15], x[14]);
-  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 5
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
-  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_adds_subs_sse2(x[7], x[6]);
-  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 6
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 7~8
-  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
-  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
-  idct32_stage9_sse2(output, x);
-}
-
-static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
-  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
-  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
-  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
-  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
-  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
-  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
-  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
-  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
-  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
-  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
-  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
-  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
-  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
-  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
-  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
-  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
-  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
-  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
-  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
-}
-
-static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
-  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
-  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
-  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
-  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
-  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
-  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
-  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
-  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
-  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
-  btf_16_adds_subs_sse2(x[32], x[35]);
-  btf_16_adds_subs_sse2(x[33], x[34]);
-  btf_16_subs_adds_sse2(x[39], x[36]);
-  btf_16_subs_adds_sse2(x[38], x[37]);
-  btf_16_adds_subs_sse2(x[40], x[43]);
-  btf_16_adds_subs_sse2(x[41], x[42]);
-  btf_16_subs_adds_sse2(x[47], x[44]);
-  btf_16_subs_adds_sse2(x[46], x[45]);
-  btf_16_adds_subs_sse2(x[48], x[51]);
-  btf_16_adds_subs_sse2(x[49], x[50]);
-  btf_16_subs_adds_sse2(x[55], x[52]);
-  btf_16_subs_adds_sse2(x[54], x[53]);
-  btf_16_adds_subs_sse2(x[56], x[59]);
-  btf_16_adds_subs_sse2(x[57], x[58]);
-  btf_16_subs_adds_sse2(x[63], x[60]);
-  btf_16_subs_adds_sse2(x[62], x[61]);
-}
-
-static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
-  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
-  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
-  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
-  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
-  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
-  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
-  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
-  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
-  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
-  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
-  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
-  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
-  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
-}
-
-static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  btf_16_adds_subs_sse2(x[16], x[19]);
-  btf_16_adds_subs_sse2(x[17], x[18]);
-  btf_16_subs_adds_sse2(x[23], x[20]);
-  btf_16_subs_adds_sse2(x[22], x[21]);
-  btf_16_adds_subs_sse2(x[24], x[27]);
-  btf_16_adds_subs_sse2(x[25], x[26]);
-  btf_16_subs_adds_sse2(x[31], x[28]);
-  btf_16_subs_adds_sse2(x[30], x[29]);
-  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
-}
-
-static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
-  btf_16_adds_subs_sse2(x[32], x[39]);
-  btf_16_adds_subs_sse2(x[33], x[38]);
-  btf_16_adds_subs_sse2(x[34], x[37]);
-  btf_16_adds_subs_sse2(x[35], x[36]);
-  btf_16_subs_adds_sse2(x[47], x[40]);
-  btf_16_subs_adds_sse2(x[46], x[41]);
-  btf_16_subs_adds_sse2(x[45], x[42]);
-  btf_16_subs_adds_sse2(x[44], x[43]);
-  btf_16_adds_subs_sse2(x[48], x[55]);
-  btf_16_adds_subs_sse2(x[49], x[54]);
-  btf_16_adds_subs_sse2(x[50], x[53]);
-  btf_16_adds_subs_sse2(x[51], x[52]);
-  btf_16_subs_adds_sse2(x[63], x[56]);
-  btf_16_subs_adds_sse2(x[62], x[57]);
-  btf_16_subs_adds_sse2(x[61], x[58]);
-  btf_16_subs_adds_sse2(x[60], x[59]);
-}
-
-static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
-                                             const __m128i __rounding,
-                                             int8_t cos_bit) {
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  btf_16_adds_subs_sse2(x[16], x[23]);
-  btf_16_adds_subs_sse2(x[17], x[22]);
-  btf_16_adds_subs_sse2(x[18], x[21]);
-  btf_16_adds_subs_sse2(x[19], x[20]);
-  btf_16_subs_adds_sse2(x[31], x[24]);
-  btf_16_subs_adds_sse2(x[30], x[25]);
-  btf_16_subs_adds_sse2(x[29], x[26]);
-  btf_16_subs_adds_sse2(x[28], x[27]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
-}
-
-static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
-                                      const __m128i __rounding,
-                                      int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_sse2(x[0], x[15]);
-  btf_16_adds_subs_sse2(x[1], x[14]);
-  btf_16_adds_subs_sse2(x[2], x[13]);
-  btf_16_adds_subs_sse2(x[3], x[12]);
-  btf_16_adds_subs_sse2(x[4], x[11]);
-  btf_16_adds_subs_sse2(x[5], x[10]);
-  btf_16_adds_subs_sse2(x[6], x[9]);
-  btf_16_adds_subs_sse2(x[7], x[8]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
-  btf_16_adds_subs_sse2(x[32], x[47]);
-  btf_16_adds_subs_sse2(x[33], x[46]);
-  btf_16_adds_subs_sse2(x[34], x[45]);
-  btf_16_adds_subs_sse2(x[35], x[44]);
-  btf_16_adds_subs_sse2(x[36], x[43]);
-  btf_16_adds_subs_sse2(x[37], x[42]);
-  btf_16_adds_subs_sse2(x[38], x[41]);
-  btf_16_adds_subs_sse2(x[39], x[40]);
-  btf_16_subs_adds_sse2(x[63], x[48]);
-  btf_16_subs_adds_sse2(x[62], x[49]);
-  btf_16_subs_adds_sse2(x[61], x[50]);
-  btf_16_subs_adds_sse2(x[60], x[51]);
-  btf_16_subs_adds_sse2(x[59], x[52]);
-  btf_16_subs_adds_sse2(x[58], x[53]);
-  btf_16_subs_adds_sse2(x[57], x[54]);
-  btf_16_subs_adds_sse2(x[56], x[55]);
-}
-
-static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
-                                       const __m128i __rounding,
-                                       int8_t cos_bit) {
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  btf_16_adds_subs_sse2(x[0], x[31]);
-  btf_16_adds_subs_sse2(x[1], x[30]);
-  btf_16_adds_subs_sse2(x[2], x[29]);
-  btf_16_adds_subs_sse2(x[3], x[28]);
-  btf_16_adds_subs_sse2(x[4], x[27]);
-  btf_16_adds_subs_sse2(x[5], x[26]);
-  btf_16_adds_subs_sse2(x[6], x[25]);
-  btf_16_adds_subs_sse2(x[7], x[24]);
-  btf_16_adds_subs_sse2(x[8], x[23]);
-  btf_16_adds_subs_sse2(x[9], x[22]);
-  btf_16_adds_subs_sse2(x[10], x[21]);
-  btf_16_adds_subs_sse2(x[11], x[20]);
-  btf_16_adds_subs_sse2(x[12], x[19]);
-  btf_16_adds_subs_sse2(x[13], x[18]);
-  btf_16_adds_subs_sse2(x[14], x[17]);
-  btf_16_adds_subs_sse2(x[15], x[16]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
-}
-
-static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
-  btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
-  btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
-  btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
-  btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
-  btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
-  btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
-  btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
-  btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
-  btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
-  btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
-  btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
-  btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
-  btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
-  btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
-  btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
-  btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
-  btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
-  btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
-  btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
-  btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
-  btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
-  btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
-  btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
-  btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
-  btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
-  btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
-  btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
-  btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
-  btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
-  btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
-  btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
-  btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
-}
-
-static void idct64_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-
-  // stage 1
-  __m128i x[32];
-  x[0] = input[0];
-
-  // stage 2
-  // stage 3
-  // stage 4
-  // stage 5
-  // stage 6
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-
-  // stage 7
-  // stage 8
-  // stage 9
-  // stage 10
-  // stage 11
-  output[0] = x[0];
-  output[63] = x[0];
-  output[1] = x[1];
-  output[62] = x[1];
-  output[2] = x[1];
-  output[61] = x[1];
-  output[3] = x[0];
-  output[60] = x[0];
-  output[4] = x[0];
-  output[59] = x[0];
-  output[5] = x[1];
-  output[58] = x[1];
-  output[6] = x[1];
-  output[57] = x[1];
-  output[7] = x[0];
-  output[56] = x[0];
-  output[8] = x[0];
-  output[55] = x[0];
-  output[9] = x[1];
-  output[54] = x[1];
-  output[10] = x[1];
-  output[53] = x[1];
-  output[11] = x[0];
-  output[52] = x[0];
-  output[12] = x[0];
-  output[51] = x[0];
-  output[13] = x[1];
-  output[50] = x[1];
-  output[14] = x[1];
-  output[49] = x[1];
-  output[15] = x[0];
-  output[48] = x[0];
-  output[16] = x[0];
-  output[47] = x[0];
-  output[17] = x[1];
-  output[46] = x[1];
-  output[18] = x[1];
-  output[45] = x[1];
-  output[19] = x[0];
-  output[44] = x[0];
-  output[20] = x[0];
-  output[43] = x[0];
-  output[21] = x[1];
-  output[42] = x[1];
-  output[22] = x[1];
-  output[41] = x[1];
-  output[23] = x[0];
-  output[40] = x[0];
-  output[24] = x[0];
-  output[39] = x[0];
-  output[25] = x[1];
-  output[38] = x[1];
-  output[26] = x[1];
-  output[37] = x[1];
-  output[27] = x[0];
-  output[36] = x[0];
-  output[28] = x[0];
-  output[35] = x[0];
-  output[29] = x[1];
-  output[34] = x[1];
-  output[30] = x[1];
-  output[33] = x[1];
-  output[31] = x[0];
-  output[32] = x[0];
-}
-
-static void idct64_low8_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
-  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
-  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
-  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
-  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
-  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
-  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
-  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
-  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
-  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
-  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
-  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m128i x[64];
-  x[0] = input[0];
-  x[8] = input[4];
-  x[16] = input[2];
-  x[24] = input[6];
-  x[32] = input[1];
-  x[40] = input[5];
-  x[48] = input[3];
-  x[56] = input[7];
-
-  // stage 2
-  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
-  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
-  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
-  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
-
-  // stage 3
-  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  x[33] = x[32];
-  x[38] = x[39];
-  x[41] = x[40];
-  x[46] = x[47];
-  x[49] = x[48];
-  x[54] = x[55];
-  x[57] = x[56];
-  x[62] = x[63];
-
-  // stage 4
-  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
-  x[17] = x[16];
-  x[22] = x[23];
-  x[25] = x[24];
-  x[30] = x[31];
-  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
-  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
-  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
-  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
-
-  // stage 5
-  x[9] = x[8];
-  x[14] = x[15];
-  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
-  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
-  x[35] = x[32];
-  x[34] = x[33];
-  x[36] = x[39];
-  x[37] = x[38];
-  x[43] = x[40];
-  x[42] = x[41];
-  x[44] = x[47];
-  x[45] = x[46];
-  x[51] = x[48];
-  x[50] = x[49];
-  x[52] = x[55];
-  x[53] = x[54];
-  x[59] = x[56];
-  x[58] = x[57];
-  x[60] = x[63];
-  x[61] = x[62];
-
-  // stage 6
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  x[19] = x[16];
-  x[18] = x[17];
-  x[20] = x[23];
-  x[21] = x[22];
-  x[27] = x[24];
-  x[26] = x[25];
-  x[28] = x[31];
-  x[29] = x[30];
-  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 7
-  x[3] = x[0];
-  x[2] = x[1];
-  x[11] = x[8];
-  x[10] = x[9];
-  x[12] = x[15];
-  x[13] = x[14];
-  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 8
-  x[7] = x[0];
-  x[6] = x[1];
-  x[5] = x[2];
-  x[4] = x[3];
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
-  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
-  idct64_stage11_sse2(output, x);
-}
-
-static void idct64_low16_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m128i x[64];
-  x[0] = input[0];
-  x[4] = input[8];
-  x[8] = input[4];
-  x[12] = input[12];
-  x[16] = input[2];
-  x[20] = input[10];
-  x[24] = input[6];
-  x[28] = input[14];
-  x[32] = input[1];
-  x[36] = input[9];
-  x[40] = input[5];
-  x[44] = input[13];
-  x[48] = input[3];
-  x[52] = input[11];
-  x[56] = input[7];
-  x[60] = input[15];
-
-  // stage 2
-  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
-  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
-  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
-  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
-  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
-  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
-  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
-  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
-
-  // stage 3
-  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  x[33] = x[32];
-  x[34] = x[35];
-  x[37] = x[36];
-  x[38] = x[39];
-  x[41] = x[40];
-  x[42] = x[43];
-  x[45] = x[44];
-  x[46] = x[47];
-  x[49] = x[48];
-  x[50] = x[51];
-  x[53] = x[52];
-  x[54] = x[55];
-  x[57] = x[56];
-  x[58] = x[59];
-  x[61] = x[60];
-  x[62] = x[63];
-
-  // stage 4
-  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  x[17] = x[16];
-  x[18] = x[19];
-  x[21] = x[20];
-  x[22] = x[23];
-  x[25] = x[24];
-  x[26] = x[27];
-  x[29] = x[28];
-  x[30] = x[31];
-  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 5
-  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
-  x[9] = x[8];
-  x[10] = x[11];
-  x[13] = x[12];
-  x[14] = x[15];
-  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 6
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-  x[5] = x[4];
-  x[6] = x[7];
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 7
-  x[3] = x[0];
-  x[2] = x[1];
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[11]);
-  btf_16_adds_subs_sse2(x[9], x[10]);
-  btf_16_subs_adds_sse2(x[15], x[12]);
-  btf_16_subs_adds_sse2(x[14], x[13]);
-  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 8
-  btf_16_adds_subs_sse2(x[0], x[7]);
-  btf_16_adds_subs_sse2(x[1], x[6]);
-  btf_16_adds_subs_sse2(x[2], x[5]);
-  btf_16_adds_subs_sse2(x[3], x[4]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
-  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
-  idct64_stage11_sse2(output, x);
-}
-
-static void idct64_low32_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
-  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
-  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
-  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
-
-  // stage 1
-  __m128i x[64];
-  x[0] = input[0];
-  x[2] = input[16];
-  x[4] = input[8];
-  x[6] = input[24];
-  x[8] = input[4];
-  x[10] = input[20];
-  x[12] = input[12];
-  x[14] = input[28];
-  x[16] = input[2];
-  x[18] = input[18];
-  x[20] = input[10];
-  x[22] = input[26];
-  x[24] = input[6];
-  x[26] = input[22];
-  x[28] = input[14];
-  x[30] = input[30];
-  x[32] = input[1];
-  x[34] = input[17];
-  x[36] = input[9];
-  x[38] = input[25];
-  x[40] = input[5];
-  x[42] = input[21];
-  x[44] = input[13];
-  x[46] = input[29];
-  x[48] = input[3];
-  x[50] = input[19];
-  x[52] = input[11];
-  x[54] = input[27];
-  x[56] = input[7];
-  x[58] = input[23];
-  x[60] = input[15];
-  x[62] = input[31];
-
-  // stage 2
-  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
-  btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
-  btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
-  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
-  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
-  btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
-  btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
-  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
-  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
-  btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
-  btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
-  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
-  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
-  btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
-  btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
-  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
-
-  // stage 3
-  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
-  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
-  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
-  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
-  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
-  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
-  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
-  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
-  btf_16_adds_subs_sse2(x[32], x[33]);
-  btf_16_subs_adds_sse2(x[35], x[34]);
-  btf_16_adds_subs_sse2(x[36], x[37]);
-  btf_16_subs_adds_sse2(x[39], x[38]);
-  btf_16_adds_subs_sse2(x[40], x[41]);
-  btf_16_subs_adds_sse2(x[43], x[42]);
-  btf_16_adds_subs_sse2(x[44], x[45]);
-  btf_16_subs_adds_sse2(x[47], x[46]);
-  btf_16_adds_subs_sse2(x[48], x[49]);
-  btf_16_subs_adds_sse2(x[51], x[50]);
-  btf_16_adds_subs_sse2(x[52], x[53]);
-  btf_16_subs_adds_sse2(x[55], x[54]);
-  btf_16_adds_subs_sse2(x[56], x[57]);
-  btf_16_subs_adds_sse2(x[59], x[58]);
-  btf_16_adds_subs_sse2(x[60], x[61]);
-  btf_16_subs_adds_sse2(x[63], x[62]);
-
-  // stage 4
-  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
-  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
-  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
-  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
-  btf_16_adds_subs_sse2(x[16], x[17]);
-  btf_16_subs_adds_sse2(x[19], x[18]);
-  btf_16_adds_subs_sse2(x[20], x[21]);
-  btf_16_subs_adds_sse2(x[23], x[22]);
-  btf_16_adds_subs_sse2(x[24], x[25]);
-  btf_16_subs_adds_sse2(x[27], x[26]);
-  btf_16_adds_subs_sse2(x[28], x[29]);
-  btf_16_subs_adds_sse2(x[31], x[30]);
-  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 5
-  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
-  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[9]);
-  btf_16_subs_adds_sse2(x[11], x[10]);
-  btf_16_adds_subs_sse2(x[12], x[13]);
-  btf_16_subs_adds_sse2(x[15], x[14]);
-  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 6
-  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
-  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[5]);
-  btf_16_subs_adds_sse2(x[7], x[6]);
-  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
-  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
-  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 7
-  btf_16_adds_subs_sse2(x[0], x[3]);
-  btf_16_adds_subs_sse2(x[1], x[2]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
-  btf_16_adds_subs_sse2(x[8], x[11]);
-  btf_16_adds_subs_sse2(x[9], x[10]);
-  btf_16_subs_adds_sse2(x[15], x[12]);
-  btf_16_subs_adds_sse2(x[14], x[13]);
-  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 8
-  btf_16_adds_subs_sse2(x[0], x[7]);
-  btf_16_adds_subs_sse2(x[1], x[6]);
-  btf_16_adds_subs_sse2(x[2], x[5]);
-  btf_16_adds_subs_sse2(x[3], x[4]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
-  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
-  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
-
-  // stage 9~11
-  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
-  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
-  idct64_stage11_sse2(output, x);
-}
-
-static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
-  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
-  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
-  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
-  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
-  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
-  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
-  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
-  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
-  __m128i x0[4];
-  x0[0] = input[0];
-  x0[1] = input[1];
-  x0[2] = input[2];
-  x0[3] = input[3];
-
-  __m128i u[4];
-  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
-  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
-  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
-  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
-
-  __m128i x1[16];
-  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
-  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
-  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
-  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
-  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
-  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
-  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
-  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
-  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
-  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
-  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x2*sin3
-  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
-  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
-  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
-  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
-  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
-
-  __m128i x2[8];
-  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
-  x2[1] = _mm_add_epi32(x1[1], x1[5]);
-  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
-  x2[3] = _mm_add_epi32(x1[3], x1[7]);
-  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
-  x2[5] = _mm_add_epi32(x1[9], x1[11]);
-  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
-  x2[7] = _mm_add_epi32(x1[13], x1[15]);
-
-  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-  for (int i = 0; i < 4; ++i) {
-    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
-    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
-    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
-    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
-    output[i] = _mm_packs_epi32(out0, out1);
-  }
-}
-
-static void iadst4_w4_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
-  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
-  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
-  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
-  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
-  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
-  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
-  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
-  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
-  __m128i x0[4];
-  x0[0] = input[0];
-  x0[1] = input[1];
-  x0[2] = input[2];
-  x0[3] = input[3];
-
-  __m128i u[2];
-  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
-  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
-
-  __m128i x1[8];
-  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
-  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
-  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
-  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
-  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
-  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x2*sin3
-  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
-  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
-
-  __m128i x2[4];
-  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
-  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
-  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
-  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
-
-  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-  for (int i = 0; i < 4; ++i) {
-    __m128i out0 = _mm_add_epi32(x2[i], rounding);
-    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
-    output[i] = _mm_packs_epi32(out0, out0);
-  }
-}
-
-static void iadst8_low1_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __zero = _mm_setzero_si128();
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-
-  // stage 1
-  __m128i x[8];
-  x[1] = input[0];
-
-  // stage 2
-  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
-
-  // stage 3
-  x[4] = x[0];
-  x[5] = x[1];
-
-  // stage 4
-  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-
-  // stage 5
-  x[2] = x[0];
-  x[3] = x[1];
-  x[6] = x[4];
-  x[7] = x[5];
-
-  // stage 6
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
-
-  // stage 7
-  output[0] = x[0];
-  output[1] = _mm_subs_epi16(__zero, x[4]);
-  output[2] = x[6];
-  output[3] = _mm_subs_epi16(__zero, x[2]);
-  output[4] = x[3];
-  output[5] = _mm_subs_epi16(__zero, x[7]);
-  output[6] = x[5];
-  output[7] = _mm_subs_epi16(__zero, x[1]);
-}
-
-static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __zero = _mm_setzero_si128();
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
-  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
-  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
-  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
-  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
-  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
-  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
-  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-
-  // stage 1
-  __m128i x[8];
-  x[0] = input[7];
-  x[1] = input[0];
-  x[2] = input[5];
-  x[3] = input[2];
-  x[4] = input[3];
-  x[5] = input[4];
-  x[6] = input[1];
-  x[7] = input[6];
-
-  // stage 2
-  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
-  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
-  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
-  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
-
-  // stage 3
-  btf_16_adds_subs_sse2(x[0], x[4]);
-  btf_16_adds_subs_sse2(x[1], x[5]);
-  btf_16_adds_subs_sse2(x[2], x[6]);
-  btf_16_adds_subs_sse2(x[3], x[7]);
-
-  // stage 4
-  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
-
-  // stage 5
-  btf_16_adds_subs_sse2(x[0], x[2]);
-  btf_16_adds_subs_sse2(x[1], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[6]);
-  btf_16_adds_subs_sse2(x[5], x[7]);
-
-  // stage 6
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
-
-  // stage 7
-  output[0] = x[0];
-  output[1] = _mm_subs_epi16(__zero, x[4]);
-  output[2] = x[6];
-  output[3] = _mm_subs_epi16(__zero, x[2]);
-  output[4] = x[3];
-  output[5] = _mm_subs_epi16(__zero, x[7]);
-  output[6] = x[5];
-  output[7] = _mm_subs_epi16(__zero, x[1]);
-}
-
-static void iadst8_w4_sse2(const __m128i *input, __m128i *output,
-                           int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __zero = _mm_setzero_si128();
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
-  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
-  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
-  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
-  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
-  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
-  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
-  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-
-  // stage 1
-  __m128i x[8];
-  x[0] = input[7];
-  x[1] = input[0];
-  x[2] = input[5];
-  x[3] = input[2];
-  x[4] = input[3];
-  x[5] = input[4];
-  x[6] = input[1];
-  x[7] = input[6];
-
-  // stage 2
-  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
-  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
-  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
-  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
-
-  // stage 3
-  btf_16_adds_subs_sse2(x[0], x[4]);
-  btf_16_adds_subs_sse2(x[1], x[5]);
-  btf_16_adds_subs_sse2(x[2], x[6]);
-  btf_16_adds_subs_sse2(x[3], x[7]);
-
-  // stage 4
-  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
-
-  // stage 5
-  btf_16_adds_subs_sse2(x[0], x[2]);
-  btf_16_adds_subs_sse2(x[1], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[6]);
-  btf_16_adds_subs_sse2(x[5], x[7]);
-
-  // stage 6
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
-
-  // stage 7
-  output[0] = x[0];
-  output[1] = _mm_subs_epi16(__zero, x[4]);
-  output[2] = x[6];
-  output[3] = _mm_subs_epi16(__zero, x[2]);
-  output[4] = x[3];
-  output[5] = _mm_subs_epi16(__zero, x[7]);
-  output[6] = x[5];
-  output[7] = _mm_subs_epi16(__zero, x[1]);
-}
-
-static INLINE void iadst16_stage3_ssse3(__m128i *x) {
-  btf_16_adds_subs_sse2(x[0], x[8]);
-  btf_16_adds_subs_sse2(x[1], x[9]);
-  btf_16_adds_subs_sse2(x[2], x[10]);
-  btf_16_adds_subs_sse2(x[3], x[11]);
-  btf_16_adds_subs_sse2(x[4], x[12]);
-  btf_16_adds_subs_sse2(x[5], x[13]);
-  btf_16_adds_subs_sse2(x[6], x[14]);
-  btf_16_adds_subs_sse2(x[7], x[15]);
-}
-
-static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
-                                        const __m128i __rounding,
-                                        int8_t cos_bit) {
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
-  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
-  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
-  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
-  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
-  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
-}
-
-static INLINE void iadst16_stage5_ssse3(__m128i *x) {
-  btf_16_adds_subs_sse2(x[0], x[4]);
-  btf_16_adds_subs_sse2(x[1], x[5]);
-  btf_16_adds_subs_sse2(x[2], x[6]);
-  btf_16_adds_subs_sse2(x[3], x[7]);
-  btf_16_adds_subs_sse2(x[8], x[12]);
-  btf_16_adds_subs_sse2(x[9], x[13]);
-  btf_16_adds_subs_sse2(x[10], x[14]);
-  btf_16_adds_subs_sse2(x[11], x[15]);
-}
-
-static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
-                                        const __m128i __rounding,
-                                        int8_t cos_bit) {
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
-  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
-  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
-  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
-}
-
-static INLINE void iadst16_stage7_ssse3(__m128i *x) {
-  btf_16_adds_subs_sse2(x[0], x[2]);
-  btf_16_adds_subs_sse2(x[1], x[3]);
-  btf_16_adds_subs_sse2(x[4], x[6]);
-  btf_16_adds_subs_sse2(x[5], x[7]);
-  btf_16_adds_subs_sse2(x[8], x[10]);
-  btf_16_adds_subs_sse2(x[9], x[11]);
-  btf_16_adds_subs_sse2(x[12], x[14]);
-  btf_16_adds_subs_sse2(x[13], x[15]);
-}
-
-static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
-                                        const __m128i __rounding,
-                                        int8_t cos_bit) {
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
-  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
-}
-
-static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
-  const __m128i __zero = _mm_setzero_si128();
-  output[0] = x[0];
-  output[1] = _mm_subs_epi16(__zero, x[8]);
-  output[2] = x[12];
-  output[3] = _mm_subs_epi16(__zero, x[4]);
-  output[4] = x[6];
-  output[5] = _mm_subs_epi16(__zero, x[14]);
-  output[6] = x[10];
-  output[7] = _mm_subs_epi16(__zero, x[2]);
-  output[8] = x[3];
-  output[9] = _mm_subs_epi16(__zero, x[11]);
-  output[10] = x[15];
-  output[11] = _mm_subs_epi16(__zero, x[7]);
-  output[12] = x[5];
-  output[13] = _mm_subs_epi16(__zero, x[13]);
-  output[14] = x[9];
-  output[15] = _mm_subs_epi16(__zero, x[1]);
-}
-
-static void iadst16_low1_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-
-  // stage 1
-  __m128i x[16];
-  x[1] = input[0];
-
-  // stage 2
-  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
-
-  // stage 3
-  x[8] = x[0];
-  x[9] = x[1];
-
-  // stage 4
-  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
-
-  // stage 5
-  x[4] = x[0];
-  x[5] = x[1];
-  x[12] = x[8];
-  x[13] = x[9];
-
-  // stage 6
-  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
-
-  // stage 7
-  x[2] = x[0];
-  x[3] = x[1];
-  x[6] = x[4];
-  x[7] = x[5];
-  x[10] = x[8];
-  x[11] = x[9];
-  x[14] = x[12];
-  x[15] = x[13];
-
-  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage9_ssse3(output, x);
-}
-
-static void iadst16_low8_ssse3(const __m128i *input, __m128i *output,
-                               int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  // stage 1
-  __m128i x[16];
-  x[1] = input[0];
-  x[3] = input[2];
-  x[5] = input[4];
-  x[7] = input[6];
-  x[8] = input[7];
-  x[10] = input[5];
-  x[12] = input[3];
-  x[14] = input[1];
-
-  // stage 2
-  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
-  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
-  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
-  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
-  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
-  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
-  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
-  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
-
-  // stage 3
-  iadst16_stage3_ssse3(x);
-  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage5_ssse3(x);
-  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage7_ssse3(x);
-  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage9_ssse3(output, x);
-}
-
-static void iadst16_sse2(const __m128i *input, __m128i *output,
-                         int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
-  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
-  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
-  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
-  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
-  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
-  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
-  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
-  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
-  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
-  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
-  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
-  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
-  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
-  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
-  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
-
-  // stage 1
-  __m128i x[16];
-  x[0] = input[15];
-  x[1] = input[0];
-  x[2] = input[13];
-  x[3] = input[2];
-  x[4] = input[11];
-  x[5] = input[4];
-  x[6] = input[9];
-  x[7] = input[6];
-  x[8] = input[7];
-  x[9] = input[8];
-  x[10] = input[5];
-  x[11] = input[10];
-  x[12] = input[3];
-  x[13] = input[12];
-  x[14] = input[1];
-  x[15] = input[14];
-
-  // stage 2
-  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
-  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
-  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
-  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
-  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
-  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
-  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
-  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
-
-  // stage 3~9
-  iadst16_stage3_ssse3(x);
-  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage5_ssse3(x);
-  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage7_ssse3(x);
-  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
-  iadst16_stage9_ssse3(output, x);
-}
-
-static void iadst16_w4_sse2(const __m128i *input, __m128i *output,
-                            int8_t cos_bit) {
-  (void)cos_bit;
-  const int32_t *cospi = cospi_arr(INV_COS_BIT);
-  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
-
-  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
-  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
-  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
-  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
-  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
-  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
-  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
-  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
-  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
-  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
-  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
-  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
-  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
-  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
-  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
-  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
-  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
-  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
-  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
-  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
-  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
-  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
-  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
-  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
-  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
-  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
-  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
-
-  // stage 1
-  __m128i x[16];
-  x[0] = input[15];
-  x[1] = input[0];
-  x[2] = input[13];
-  x[3] = input[2];
-  x[4] = input[11];
-  x[5] = input[4];
-  x[6] = input[9];
-  x[7] = input[6];
-  x[8] = input[7];
-  x[9] = input[8];
-  x[10] = input[5];
-  x[11] = input[10];
-  x[12] = input[3];
-  x[13] = input[12];
-  x[14] = input[1];
-  x[15] = input[14];
-
-  // stage 2
-  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
-  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
-  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
-  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
-  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
-  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
-  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
-  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
-
-  // stage 3
-  iadst16_stage3_ssse3(x);
-
-  // stage 4
-  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
-  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
-  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
-  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
-
-  // stage 5
-  iadst16_stage5_ssse3(x);
-
-  // stage 6
-  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
-  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
-  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
-  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
-
-  // stage 7
-  iadst16_stage7_ssse3(x);
-
-  // stage 8
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
-  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
-
-  // stage 9
-  iadst16_stage9_ssse3(output, x);
-}
-
-static void iidentity4_ssse3(const __m128i *input, __m128i *output,
-                             int8_t cos_bit) {
-  (void)cos_bit;
-  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
-  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
-  for (int i = 0; i < 4; ++i) {
-    __m128i x = _mm_mulhrs_epi16(input[i], scale);
-    output[i] = _mm_adds_epi16(x, input[i]);
-  }
-}
-
-static void iidentity8_sse2(const __m128i *input, __m128i *output,
-                            int8_t cos_bit) {
-  (void)cos_bit;
-  for (int i = 0; i < 8; ++i) {
-    output[i] = _mm_adds_epi16(input[i], input[i]);
-  }
-}
-
-static void iidentity16_ssse3(const __m128i *input, __m128i *output,
-                              int8_t cos_bit) {
-  (void)cos_bit;
-  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
-  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
-  for (int i = 0; i < 16; ++i) {
-    __m128i x = _mm_mulhrs_epi16(input[i], scale);
-    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
-    output[i] = _mm_adds_epi16(x, srcx2);
-  }
-}
-
-static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
-                                               __m128i res) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
-  return _mm_packus_epi16(x0, x0);
-}
-
-static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
-                                               int stride, int flipud,
-                                               const int height) {
-  int j = flipud ? (height - 1) : 0;
-  const int step = flipud ? -1 : 1;
-  const __m128i zero = _mm_setzero_si128();
-  for (int i = 0; i < height; ++i, j += step) {
-    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
-    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
-    u = _mm_packus_epi16(u, zero);
-    *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
-  }
-}
-
-static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
-                                               int stride, int flipud,
-                                               const int height) {
-  int j = flipud ? (height - 1) : 0;
-  const int step = flipud ? -1 : 1;
-  for (int i = 0; i < height; ++i, j += step) {
-    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
-    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
-    _mm_storel_epi64((__m128i *)(output + i * stride), u);
-  }
-}
-
-// 1D functions process 8 pixels at one time.
-static const transform_1d_ssse3
-    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
-      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
-      { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
-      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
-      { idct32_sse2, NULL, NULL },
-      { idct64_low32_ssse3, NULL, NULL },
-    };
-
-// Functions for blocks with eob at DC only, or with nonzero coefficients
-// confined to the top-left 8x8, 16x16, or 32x32 corner
-static const transform_1d_ssse3
-    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
-      {
-          { idct4_sse2, idct4_sse2, NULL, NULL },
-          { iadst4_sse2, iadst4_sse2, NULL, NULL },
-          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
-      },
-      { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
-        { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
-        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
-      {
-          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
-          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
-          { NULL, NULL, NULL, NULL },
-      },
-      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
-          idct32_sse2 },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } },
-      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
-          idct64_low32_ssse3 },
-        { NULL, NULL, NULL, NULL },
-        { NULL, NULL, NULL, NULL } }
-    };
-
-// 1D functions process 4 pixels at one time.
-// used in 4x4, 4x8, 4x16, 8x4, 16x4
-static const transform_1d_ssse3
-    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
-      { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
-      { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
-      { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
-      { NULL, NULL, NULL },
-      { NULL, NULL, NULL },
-    };
-
-static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
-                                           int stride, int shift, int height,
-                                           int txw_idx, int rect_type) {
-  const int32_t *input_row = input;
-  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
-  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
-                                          (1 << (NewSqrt2Bits - shift - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
-  if (rect_type != 1 && rect_type != -1) {
-    for (int i = 0; i < height; ++i) {
-      const __m128i src = load_32bit_to_16bit(input_row);
-      input_row += stride;
-      __m128i lo = _mm_unpacklo_epi16(src, one);
-      __m128i hi = _mm_unpackhi_epi16(src, one);
-      lo = _mm_madd_epi16(lo, scale_rounding);
-      hi = _mm_madd_epi16(hi, scale_rounding);
-      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
-      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
-      out[i] = _mm_packs_epi32(lo, hi);
-    }
-  } else {
-    const __m128i rect_scale =
-        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
-    for (int i = 0; i < height; ++i) {
-      __m128i src = load_32bit_to_16bit(input_row);
-      src = _mm_mulhrs_epi16(src, rect_scale);
-      input_row += stride;
-      __m128i lo = _mm_unpacklo_epi16(src, one);
-      __m128i hi = _mm_unpackhi_epi16(src, one);
-      lo = _mm_madd_epi16(lo, scale_rounding);
-      hi = _mm_madd_epi16(hi, scale_rounding);
-      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
-      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
-      out[i] = _mm_packs_epi32(lo, hi);
-    }
-  }
-}
-
-static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
-                                           __m128i *buf, int shift, int height,
-                                           int txh_idx) {
-  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
-  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
-  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
-  const __m128i zero = _mm_setzero_si128();
-  for (int h = 0; h < height; ++h) {
-    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
-    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
-    lo = _mm_madd_epi16(lo, scale_coeff);
-    hi = _mm_madd_epi16(hi, scale_coeff);
-    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
-    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
-    lo = _mm_add_epi32(lo, shift_rounding);
-    hi = _mm_add_epi32(hi, shift_rounding);
-    lo = _mm_srai_epi32(lo, -shift);
-    hi = _mm_srai_epi32(hi, -shift);
-    __m128i x = _mm_packs_epi32(lo, hi);
-
-    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
-    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
-    const __m128i u = _mm_packus_epi16(x, x);
-    _mm_storel_epi64((__m128i *)(output), u);
-    output += stride;
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
-                                                   uint8_t *output, int stride,
-                                                   TX_SIZE tx_size) {
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int row_max = AOMMIN(32, txfm_size_row);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  __m128i buf[32];
-
-  for (int i = 0; i < (input_stride >> 3); ++i) {
-    iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
-                            txw_idx, rect_type);
-    iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
-                            txh_idx);
-  }
-}
-
-static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
-                                           uint8_t *output, int stride,
-                                           TX_TYPE tx_type, TX_SIZE tx_size_,
-                                           int eob) {
-  (void)tx_size_;
-  (void)eob;
-  __m128i buf[4];
-  const TX_SIZE tx_size = TX_4X4;
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
-  transpose_16bit_4x4(buf, buf);
-  row_txfm(buf, buf, cos_bit_row);
-  if (lr_flip) {
-    __m128i temp[4];
-    flip_buf_sse2(buf, temp, txfm_size_col);
-    transpose_16bit_4x4(temp, buf);
-  } else {
-    transpose_16bit_4x4(buf, buf);
-  }
-  col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
-  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
-}
-
-static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
-                                                 __m128i res0, __m128i res1) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i x0 = _mm_unpacklo_epi8(pred, zero);
-  __m128i x1 = _mm_unpackhi_epi8(pred, zero);
-  x0 = _mm_adds_epi16(res0, x0);
-  x1 = _mm_adds_epi16(res1, x1);
-  return _mm_packus_epi16(x0, x1);
-}
-
-static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
-                                                int stride, int flipud,
-                                                int height) {
-  int j = flipud ? (height - 1) : 0;
-  const int step = flipud ? -1 : 1;
-  for (int i = 0; i < height; ++i, j += step) {
-    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
-    __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
-    _mm_storeu_si128((__m128i *)(output + i * stride), u);
-  }
-}
-
-static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
-                                     int size) {
-  const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
-  for (int i = 0; i < size; ++i) {
-    output[i] = _mm_mulhrs_epi16(input[i], scale);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  __m128i buf1[64 * 8];
-  int eobx, eoby;
-  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
-  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
-
-  assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
-    __m128i buf0[64];
-    const int32_t *input_row = input + i * input_stride * 8;
-    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
-      __m128i *buf0_cur = buf0 + j * 8;
-      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
-      transpose_16bit_8x8(buf0_cur, buf0_cur);
-    }
-    if (rect_type == 1 || rect_type == -1) {
-      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
-    }
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
-    __m128i *_buf1 = buf1 + i * 8;
-    if (lr_flip) {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        __m128i temp[8];
-        flip_buf_sse2(buf0 + 8 * j, temp, 8);
-        transpose_16bit_8x8(temp,
-                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
-      }
-    } else {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
-      }
-    }
-  }
-  for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
-    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
-  }
-
-  if (txfm_size_col >= 16) {
-    for (int i = 0; i < (txfm_size_col >> 4); i++) {
-      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
-                                   output + 16 * i, stride, ud_flip,
-                                   txfm_size_row);
-    }
-  } else if (txfm_size_col == 8) {
-    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  int eobx, eoby;
-  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_size_w_div8 = (eobx + 8) >> 3;
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-
-  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
-  assert(fun_idx < 5);
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
-
-  assert(col_txfm != NULL);
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_w_div8; i++) {
-    __m128i buf0[64];
-    iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
-                            eoby + 1, txw_idx, rect_type);
-    col_txfm(buf0, buf0, cos_bit_col);
-    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
-    int k = ud_flip ? (txfm_size_row - 1) : 0;
-    const int step = ud_flip ? -1 : 1;
-    uint8_t *out = output + 8 * i;
-    for (int j = 0; j < txfm_size_row; ++j, k += step) {
-      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
-      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
-      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
-      _mm_storel_epi64((__m128i *)(out), u);
-      out += stride;
-    }
-  }
-}
-
-static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  __m128i buf1[64];
-  int eobx, eoby;
-  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-  const int buf_size_h_div8 = (eoby + 8) >> 3;
-  const int input_stride = AOMMIN(32, txfm_size_col);
-  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-
-  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
-
-  assert(row_txfm != NULL);
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_h_div8; i++) {
-    __m128i buf0[64];
-    const int32_t *input_row = input + i * input_stride * 8;
-    for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
-      __m128i *buf0_cur = buf0 + j * 8;
-      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
-      transpose_16bit_8x8(buf0_cur, buf0_cur);
-    }
-    if (rect_type == 1 || rect_type == -1) {
-      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
-    }
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
-    __m128i *_buf1 = buf1;
-    if (lr_flip) {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        __m128i temp[8];
-        flip_buf_sse2(buf0 + 8 * j, temp, 8);
-        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
-      }
-    } else {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
-      }
-    }
-
-    for (int j = 0; j < buf_size_w_div8; ++j) {
-      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
-                              buf1 + j * 8, shift[1], 8, txh_idx);
-    }
-  }
-}
-
-// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
-static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
-    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
-    TX_SIZE tx_size, int eob) {
-  switch (tx_type) {
-    case DCT_DCT:
-      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
-                                             tx_size, eob);
-      break;
-    case IDTX:
-      lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
-      break;
-    case V_DCT:
-    case V_ADST:
-    case V_FLIPADST:
-      lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
-                                            tx_size, eob);
-      break;
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
-      lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
-                                            tx_size, eob);
-      break;
-    default:
-      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
-                                             tx_size, eob);
-      break;
-  }
-}
-
-static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
-                                           uint8_t *output, int stride,
-                                           TX_TYPE tx_type, TX_SIZE tx_size_,
-                                           int eob) {
-  (void)tx_size_;
-  (void)eob;
-  __m128i buf[8];
-  const TX_SIZE tx_size = TX_4X8;
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
-  transpose_16bit_4x8(buf, buf);
-  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
-  row_txfm(buf, buf, cos_bit_row);
-  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
-  if (lr_flip) {
-    __m128i temp[4];
-    flip_buf_sse2(buf, temp, txfm_size_col);
-    transpose_16bit_8x4(temp, buf);
-  } else {
-    transpose_16bit_8x4(buf, buf);
-  }
-  col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
-  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
-}
-
-static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
-                                           uint8_t *output, int stride,
-                                           TX_TYPE tx_type, TX_SIZE tx_size_,
-                                           int eob) {
-  (void)tx_size_;
-  (void)eob;
-  __m128i buf[8];
-  const TX_SIZE tx_size = TX_8X4;
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
-  transpose_16bit_8x4(buf, buf);
-  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
-  row_txfm(buf, buf, cos_bit_row);
-  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
-  if (lr_flip) {
-    __m128i temp[8];
-    flip_buf_sse2(buf, temp, txfm_size_col);
-    transpose_16bit_4x8(temp, buf);
-  } else {
-    transpose_16bit_4x8(buf, buf);
-  }
-  col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
-  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
-}
-
-static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
-                                            uint8_t *output, int stride,
-                                            TX_TYPE tx_type, TX_SIZE tx_size_,
-                                            int eob) {
-  (void)tx_size_;
-  (void)eob;
-  __m128i buf[16];
-  const TX_SIZE tx_size = TX_4X16;
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-
-  const int row_one_loop = 8;
-  for (int i = 0; i < 2; ++i) {
-    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
-    __m128i *buf_cur = buf + i * row_one_loop;
-    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
-                                  row_one_loop);
-    transpose_16bit_4x8(buf_cur, buf_cur);
-    if (row_txfm == iidentity4_ssse3) {
-      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
-      const __m128i ones = _mm_set1_epi16(1);
-      for (int j = 0; j < 4; ++j) {
-        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
-        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
-        const __m128i buf_32_lo =
-            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
-        const __m128i buf_32_hi =
-            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
-        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
-      }
-    } else {
-      row_txfm(buf_cur, buf_cur, cos_bit_row);
-      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
-    }
-    if (lr_flip) {
-      __m128i temp[8];
-      flip_buf_sse2(buf_cur, temp, txfm_size_col);
-      transpose_16bit_8x4(temp, buf_cur);
-    } else {
-      transpose_16bit_8x4(buf_cur, buf_cur);
-    }
-  }
-  col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
-  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
-}
-
-static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
-                                            uint8_t *output, int stride,
-                                            TX_TYPE tx_type, TX_SIZE tx_size_,
-                                            int eob) {
-  (void)tx_size_;
-  (void)eob;
-  __m128i buf[16];
-  const TX_SIZE tx_size = TX_16X4;
-  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
-  const int txfm_size_col = tx_size_wide[tx_size];
-  const int txfm_size_row = tx_size_high[tx_size];
-  const int buf_size_w_div8 = txfm_size_col >> 3;
-
-  const transform_1d_ssse3 row_txfm =
-      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
-  const transform_1d_ssse3 col_txfm =
-      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int row_one_loop = 8;
-  for (int i = 0; i < buf_size_w_div8; ++i) {
-    const int32_t *input_cur = input + i * row_one_loop;
-    __m128i *buf_cur = buf + i * row_one_loop;
-    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
-                               txfm_size_row);
-    transpose_16bit_8x4(buf_cur, buf_cur);
-  }
-  if (row_txfm == iidentity16_ssse3) {
-    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
-    const __m128i ones = _mm_set1_epi16(1);
-    for (int j = 0; j < 16; ++j) {
-      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
-      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
-      const __m128i buf_32_lo =
-          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
-      const __m128i buf_32_hi =
-          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
-      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
-    }
-  } else {
-    row_txfm(buf, buf, cos_bit_row);
-    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
-  }
-  if (lr_flip) {
-    __m128i temp[16];
-    flip_buf_sse2(buf, temp, 16);
-    transpose_16bit_4x8(temp, buf);
-    transpose_16bit_4x8(temp + 8, buf + 8);
-  } else {
-    transpose_16bit_4x8(buf, buf);
-    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
-  }
-  for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
-    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
-  }
-  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
-  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
-}
-
-void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
-                                    int stride, TX_TYPE tx_type,
-                                    TX_SIZE tx_size, int eob) {
-  switch (tx_size) {
-    case TX_4X4:
-      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
-                                     eob);
-      break;
-    case TX_4X8:
-      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
-                                     eob);
-      break;
-    case TX_8X4:
-      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
-                                     eob);
-      break;
-    case TX_4X16:
-      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
-                                      eob);
-      break;
-    case TX_16X4:
-      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
-                                      eob);
-      break;
-    default:
-      lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
-                                          tx_size, eob);
-      break;
-  }
-}
-
-void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
-                            const TxfmParam *txfm_param) {
-  if (!txfm_param->lossless) {
-    const TX_TYPE tx_type = txfm_param->tx_type;
-    av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
-                                   txfm_param->tx_size, txfm_param->eob);
-
-  } else {
-    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
-  }
-}
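
For reference (not part of the patch itself): the deleted path above trims work for sparse blocks by mapping the eobx/eoby position through lowbd_txfm_all_1d_zeros_idx and selecting a progressively longer partial 1D transform (DC-only, low8, low16, or full length for the larger sizes). A minimal standalone sketch of that lookup, reusing the same table values:

#include <stdio.h>

/* Same values as lowbd_txfm_all_1d_zeros_idx: the last nonzero coefficient
 * position along one dimension (clamped to 0..31) is bucketed into one of
 * four partial-transform variants. */
static const int txfm_all_1d_zeros_idx[32] = {
  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
};

int main(void) {
  for (int eob = 0; eob < 32; eob += 7) {
    printf("eobx/eoby %2d -> 1D transform variant %d\n", eob,
           txfm_all_1d_zeros_idx[eob]);
  }
  return 0;
}
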
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.h b/av1/common/x86/av1_inv_txfm_ssse3.h
index 1551527..beb517a 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -26,52 +26,6 @@
 extern "C" {
 #endif
 
-#define btf_16_ssse3(w0, w1, in, out0, out1)    \
-  do {                                          \
-    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
-    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
-    const __m128i _in = in;                     \
-    out0 = _mm_mulhrs_epi16(_in, _w0);          \
-    out1 = _mm_mulhrs_epi16(_in, _w1);          \
-  } while (0)
-
-#define btf_16_adds_subs_sse2(in0, in1) \
-  do {                                  \
-    const __m128i _in0 = in0;           \
-    const __m128i _in1 = in1;           \
-    in0 = _mm_adds_epi16(_in0, _in1);   \
-    in1 = _mm_subs_epi16(_in0, _in1);   \
-  } while (0)
-
-#define btf_16_subs_adds_sse2(in0, in1) \
-  do {                                  \
-    const __m128i _in0 = in0;           \
-    const __m128i _in1 = in1;           \
-    in1 = _mm_subs_epi16(_in0, _in1);   \
-    in0 = _mm_adds_epi16(_in0, _in1);   \
-  } while (0)
-
-#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
-  do {                                                  \
-    const __m128i _in0 = in0;                           \
-    const __m128i _in1 = in1;                           \
-    out0 = _mm_adds_epi16(_in0, _in1);                  \
-    out1 = _mm_subs_epi16(_in0, _in1);                  \
-  } while (0)
-
-static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
-  if (bit < 0) {
-    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm_mulhrs_epi16(in[i], scale);
-    }
-  } else if (bit > 0) {
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm_slli_epi16(in[i], bit);
-    }
-  }
-}
-
 // 1D itx types
 enum {
   IDCT_1D,
@@ -170,7 +124,7 @@
   av1_eob_to_eobxy_32x16_default,
 };
 
-static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+static const int highbd_txfm_all_1d_zeros_idx[32] = {
   0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 };
@@ -220,12 +174,6 @@
   *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
 }
 
-typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
-                                   int8_t cos_bit);
-
-void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
-                                    int stride, TX_TYPE tx_type,
-                                    TX_SIZE tx_size, int eob);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
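
The round_shift_16bit_ssse3 helper removed above is a rounded shift expressed with _mm_mulhrs_epi16; a scalar sketch of the same operation (ignoring the 16-bit saturation of the SIMD version):

#include <stdint.h>

/* Scalar equivalent of the removed round_shift_16bit_ssse3: a negative
 * `bit` is a rounding right shift (the SIMD version multiplies by
 * 2^(15+bit) with _mm_mulhrs_epi16, i.e. (x * 2^(15+bit) + 2^14) >> 15),
 * a positive `bit` is a plain left shift. */
static int16_t round_shift_16bit(int16_t x, int bit) {
  if (bit < 0) return (int16_t)((x + (1 << (-bit - 1))) >> -bit);
  if (bit > 0) return (int16_t)(x << bit);
  return x;
}
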
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index ec26f98..f49f914 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -59,107 +59,6 @@
  *
  * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
  */
-static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
-                                              int input_stride,
-                                              uint16_t *pred_buf_q3, int width,
-                                              int height) {
-  (void)width;                               // Forever 32
-  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
-  const int luma_stride = input_stride << 1;
-  __m256i *row = (__m256i *)pred_buf_q3;
-  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
-  do {
-    __m256i top = _mm256_loadu_si256((__m256i *)input);
-    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
-
-    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
-    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
-    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
-
-    _mm256_storeu_si256(row, sum_16x16);
-
-    input += luma_stride;
-  } while ((row += CFL_BUF_LINE_I256) < row_end);
-}
-
-CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
-
-/**
- * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
- * precise version of a box filter 4:2:2 pixel subsampling in Q3.
- *
- * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
- * active area is specified using width and height.
- *
- * Note: We don't need to worry about going over the active area, as long as we
- * stay inside the CfL prediction buffer.
- */
-static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
-                                              int input_stride,
-                                              uint16_t *pred_buf_q3, int width,
-                                              int height) {
-  (void)width;                                // Forever 32
-  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
-  __m256i *row = (__m256i *)pred_buf_q3;
-  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
-  do {
-    __m256i top = _mm256_loadu_si256((__m256i *)input);
-    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
-    _mm256_storeu_si256(row, top_16x16);
-    input += input_stride;
-  } while ((row += CFL_BUF_LINE_I256) < row_end);
-}
-
-CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
-
-/**
- * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
- * performed on blocks of width 32.
- *
- * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
- * active area is specified using width and height.
- *
- * Note: We don't need to worry about going over the active area, as long as we
- * stay inside the CfL prediction buffer.
- */
-static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
-                                              int input_stride,
-                                              uint16_t *pred_buf_q3, int width,
-                                              int height) {
-  (void)width;  // Forever 32
-  __m256i *row = (__m256i *)pred_buf_q3;
-  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
-  const __m256i zeros = _mm256_setzero_si256();
-  do {
-    __m256i top = _mm256_loadu_si256((__m256i *)input);
-    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
-
-    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
-    row_lo = _mm256_slli_epi16(row_lo, 3);
-    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
-    row_hi = _mm256_slli_epi16(row_hi, 3);
-
-    _mm256_storeu_si256(row, row_lo);
-    _mm256_storeu_si256(row + 1, row_hi);
-
-    input += input_stride;
-  } while ((row += CFL_BUF_LINE_I256) < row_end);
-}
-
-CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
-
-/**
- * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
- * precise version of a box filter 4:2:0 pixel subsampling in Q3.
- *
- * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
- * active area is specified using width and height.
- *
- * Note: We don't need to worry about going over the active area, as long as we
- * stay inside the CfL prediction buffer.
- *
- * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
- */
 static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
                                               int input_stride,
                                               uint16_t *pred_buf_q3, int width,
@@ -250,57 +149,6 @@
   return _mm256_add_epi16(scaled_luma_q0, dc_q0);
 }
 
-static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
-                                        uint8_t *dst, int dst_stride,
-                                        int alpha_q3, int width, int height) {
-  (void)width;
-  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
-  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
-  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
-  __m256i *row = (__m256i *)pred_buf_q3;
-  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
-
-  do {
-    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
-    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
-    res = _mm256_packus_epi16(res, next);
-    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
-    _mm256_storeu_si256((__m256i *)dst, res);
-    dst += dst_stride;
-  } while ((row += CFL_BUF_LINE_I256) < row_end);
-}
-
-CFL_PREDICT_X(avx2, 32, 8, lbd);
-CFL_PREDICT_X(avx2, 32, 16, lbd);
-CFL_PREDICT_X(avx2, 32, 32, lbd);
-
-cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
-  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
-    cfl_predict_lbd_4x4_ssse3,   /* 4x4 */
-    cfl_predict_lbd_8x8_ssse3,   /* 8x8 */
-    cfl_predict_lbd_16x16_ssse3, /* 16x16 */
-    cfl_predict_lbd_32x32_avx2,  /* 32x32 */
-    NULL,                        /* 64x64 (invalid CFL size) */
-    cfl_predict_lbd_4x8_ssse3,   /* 4x8 */
-    cfl_predict_lbd_8x4_ssse3,   /* 8x4 */
-    cfl_predict_lbd_8x16_ssse3,  /* 8x16 */
-    cfl_predict_lbd_16x8_ssse3,  /* 16x8 */
-    cfl_predict_lbd_16x32_ssse3, /* 16x32 */
-    cfl_predict_lbd_32x16_avx2,  /* 32x16 */
-    NULL,                        /* 32x64 (invalid CFL size) */
-    NULL,                        /* 64x32 (invalid CFL size) */
-    cfl_predict_lbd_4x16_ssse3,  /* 4x16  */
-    cfl_predict_lbd_16x4_ssse3,  /* 16x4  */
-    cfl_predict_lbd_8x32_ssse3,  /* 8x32  */
-    cfl_predict_lbd_32x8_avx2,   /* 32x8  */
-    NULL,                        /* 16x64 (invalid CFL size) */
-    NULL,                        /* 64x16 (invalid CFL size) */
-  };
-  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
-  // function pointer array out of bounds.
-  return pred[tx_size % TX_SIZES_ALL];
-}
-
 static __m256i highbd_max_epi16(int bd) {
   const __m256i neg_one = _mm256_set1_epi16(-1);
   // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
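
As the subsampling comments above spell out, every variant stores luma in the prediction buffer in Q3: 4:2:0 sums a 2x2 block and doubles it, 4:2:2 sums a horizontal pair and multiplies by 4, and 4:4:4 simply multiplies by 8, so each case ends up with the (averaged) sample value times 8. A scalar sketch of the three cases for the 8-bit input handled by the removed lbd functions:

#include <stdint.h>

/* One Q3 output sample per call; `in` is the 8-bit luma plane, (x, y) the
 * position in the subsampled (chroma) grid. */
static uint16_t cfl_q3_420(const uint8_t *in, int stride, int x, int y) {
  /* 2x2 box sum times 2 == average times 8. */
  const uint8_t *row0 = in + 2 * y * stride, *row1 = row0 + stride;
  return 2 * (row0[2 * x] + row0[2 * x + 1] + row1[2 * x] + row1[2 * x + 1]);
}

static uint16_t cfl_q3_422(const uint8_t *in, int stride, int x, int y) {
  /* 2x1 box sum times 4 == average times 8. */
  const uint8_t *row = in + y * stride;
  return 4 * (row[2 * x] + row[2 * x + 1]);
}

static uint16_t cfl_q3_444(const uint8_t *in, int stride, int x, int y) {
  /* No subsampling, only the Q3 scaling. */
  return (uint16_t)(in[y * stride + x] << 3);
}
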
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index ba362ac..3ac903c 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -15,90 +15,6 @@
 
 #include "av1/common/blockd.h"
 
-// SSSE3 version is optimal for width == 4, we reuse them in AVX2
-void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 16, we reuse it in AVX2
-void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
-                                       int input_stride, uint16_t *output_q3);
-void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
-                                       int input_stride, uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 4, we reuse them in AVX2
-void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 16, we reuse it in AVX2
-void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
-                                       int input_stride, uint16_t *output_q3);
-void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
-                                       int input_stride, uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 4, we reuse them in AVX2
-void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 8, we reuse it in AVX2
-void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                     uint16_t *output_q3);
-void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-
-// SSSE3 version is optimal for width == 16, we reuse it in AVX2
-void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
-                                      uint16_t *output_q3);
-void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
-                                       int input_stride, uint16_t *output_q3);
-void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
-                                       int input_stride, uint16_t *output_q3);
-
 void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
                                      uint16_t *output_q3);
 void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
@@ -191,31 +107,6 @@
 void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
 void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
 
-void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                               int dst_stride, int alpha_q3);
-void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                               int dst_stride, int alpha_q3);
-void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                int dst_stride, int alpha_q3);
-
-void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                               int dst_stride, int alpha_q3);
-void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                               int dst_stride, int alpha_q3);
-void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                int dst_stride, int alpha_q3);
-void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                int dst_stride, int alpha_q3);
-
-void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                int dst_stride, int alpha_q3);
-void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                int dst_stride, int alpha_q3);
-void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                 int dst_stride, int alpha_q3);
-void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
-                                 int dst_stride, int alpha_q3);
-
 void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                                int dst_stride, int alpha_q3, int bd);
 void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
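
The predictors declared here (both the removed lbd set and the remaining hbd set) share one formula: the chroma prediction is the DC value plus alpha_q3 times the zero-mean luma in Q3, where the combined 3+3 fractional bits are removed by a shift of 6 and the result is clipped to the bit depth. A scalar sketch of a single pixel, with the SIMD rounding and sign handling simplified:

/* dc: DC_PRED chroma value; ac_q3: zero-mean subsampled luma in Q3;
 * alpha_q3: signalled CfL scaling factor in Q3; bd: bit depth (8 for the
 * removed lbd path). */
static int cfl_predict_pixel(int dc, int ac_q3, int alpha_q3, int bd) {
  const int scaled_luma_q0 = (alpha_q3 * ac_q3) >> 6;  /* Q3 * Q3 -> Q0 */
  const int val = dc + scaled_luma_q0;
  const int max = (1 << bd) - 1;
  return val < 0 ? 0 : (val > max ? max : val);
}
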
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index 52e1cc3..ec7152d 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -38,147 +38,6 @@
  * Note: We don't need to worry about going over the active area, as long as we
  * stay inside the CfL prediction buffer.
  */
-static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
-                                                      int input_stride,
-                                                      uint16_t *pred_buf_q3,
-                                                      int width, int height) {
-  const __m128i twos = _mm_set1_epi8(2);
-  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
-  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
-  const int luma_stride = input_stride << 1;
-  do {
-    if (width == 4) {
-      __m128i top = _mm_loadh_epi32((__m128i *)input);
-      top = _mm_maddubs_epi16(top, twos);
-      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
-      bot = _mm_maddubs_epi16(bot, twos);
-      const __m128i sum = _mm_add_epi16(top, bot);
-      _mm_storeh_epi32(pred_buf_m128i, sum);
-    } else if (width == 8) {
-      __m128i top = _mm_loadl_epi64((__m128i *)input);
-      top = _mm_maddubs_epi16(top, twos);
-      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
-      bot = _mm_maddubs_epi16(bot, twos);
-      const __m128i sum = _mm_add_epi16(top, bot);
-      _mm_storel_epi64(pred_buf_m128i, sum);
-    } else {
-      __m128i top = _mm_loadu_si128((__m128i *)input);
-      top = _mm_maddubs_epi16(top, twos);
-      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
-      bot = _mm_maddubs_epi16(bot, twos);
-      const __m128i sum = _mm_add_epi16(top, bot);
-      _mm_storeu_si128(pred_buf_m128i, sum);
-      if (width == 32) {
-        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
-        __m128i bot_1 =
-            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
-        top_1 = _mm_maddubs_epi16(top_1, twos);
-        bot_1 = _mm_maddubs_epi16(bot_1, twos);
-        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
-        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
-      }
-    }
-    input += luma_stride;
-    pred_buf_m128i += CFL_BUF_LINE_I128;
-  } while (pred_buf_m128i < end);
-}
-
-/**
- * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
- * precise version of a box filter 4:2:2 pixel subsampling in Q3.
- *
- * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
- * active area is specified using width and height.
- *
- * Note: We don't need to worry about going over the active area, as long as we
- * stay inside the CfL prediction buffer.
- */
-static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
-                                                      int input_stride,
-                                                      uint16_t *pred_buf_q3,
-                                                      int width, int height) {
-  const __m128i fours = _mm_set1_epi8(4);
-  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
-  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
-  do {
-    if (width == 4) {
-      __m128i top = _mm_loadh_epi32((__m128i *)input);
-      top = _mm_maddubs_epi16(top, fours);
-      _mm_storeh_epi32(pred_buf_m128i, top);
-    } else if (width == 8) {
-      __m128i top = _mm_loadl_epi64((__m128i *)input);
-      top = _mm_maddubs_epi16(top, fours);
-      _mm_storel_epi64(pred_buf_m128i, top);
-    } else {
-      __m128i top = _mm_loadu_si128((__m128i *)input);
-      top = _mm_maddubs_epi16(top, fours);
-      _mm_storeu_si128(pred_buf_m128i, top);
-      if (width == 32) {
-        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
-        top_1 = _mm_maddubs_epi16(top_1, fours);
-        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
-      }
-    }
-    input += input_stride;
-    pred_buf_m128i += CFL_BUF_LINE_I128;
-  } while (pred_buf_m128i < end);
-}
-
-/**
- * Multiplies the pixels by 8 (scaling in Q3).
- *
- * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
- * active area is specified using width and height.
- *
- * Note: We don't need to worry about going over the active area, as long as we
- * stay inside the CfL prediction buffer.
- */
-static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
-                                                      int input_stride,
-                                                      uint16_t *pred_buf_q3,
-                                                      int width, int height) {
-  const __m128i zeros = _mm_setzero_si128();
-  const int luma_stride = input_stride;
-  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
-  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
-  do {
-    if (width == 4) {
-      __m128i row = _mm_loadh_epi32((__m128i *)input);
-      row = _mm_unpacklo_epi8(row, zeros);
-      _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
-    } else if (width == 8) {
-      __m128i row = _mm_loadl_epi64((__m128i *)input);
-      row = _mm_unpacklo_epi8(row, zeros);
-      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
-    } else {
-      __m128i row = _mm_loadu_si128((__m128i *)input);
-      const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
-      const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
-      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
-      _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
-      if (width == 32) {
-        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
-        const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
-        const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
-        _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
-        _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
-      }
-    }
-    input += luma_stride;
-    pred_buf_m128i += CFL_BUF_LINE_I128;
-  } while (pred_buf_m128i < end);
-}
-
-/**
- * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
- * precise version of a box filter 4:2:0 pixel subsampling in Q3.
- *
- * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
- * active area is specified using width and height.
- *
- * Note: We don't need to worry about going over the active area, as long as we
- * stay inside the CfL prediction buffer.
- */
 static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                       int input_stride,
                                                       uint16_t *pred_buf_q3,
@@ -309,39 +168,6 @@
   return _mm_add_epi16(scaled_luma_q0, dc_q0);
 }
 
-static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
-                                         uint8_t *dst, int dst_stride,
-                                         int alpha_q3, int width, int height) {
-  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
-  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
-  const __m128i dc_q0 = _mm_set1_epi16(*dst);
-  __m128i *row = (__m128i *)pred_buf_q3;
-  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
-  do {
-    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
-    if (width < 16) {
-      res = _mm_packus_epi16(res, res);
-      if (width == 4)
-        _mm_storeh_epi32((__m128i *)dst, res);
-      else
-        _mm_storel_epi64((__m128i *)dst, res);
-    } else {
-      __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
-      res = _mm_packus_epi16(res, next);
-      _mm_storeu_si128((__m128i *)dst, res);
-      if (width == 32) {
-        res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
-        next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
-        res = _mm_packus_epi16(res, next);
-        _mm_storeu_si128((__m128i *)(dst + 16), res);
-      }
-    }
-    dst += dst_stride;
-  } while ((row += CFL_BUF_LINE_I128) < row_end);
-}
-
-CFL_PREDICT_FN(ssse3, lbd)
-
 static INLINE __m128i highbd_max_epi16(int bd) {
   const __m128i neg_one = _mm_set1_epi16(-1);
   // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
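
The comment above (also present in the AVX2 variant) derives the clipping maximum: -1 << bd sets every bit from position bd upward, so xoring with all-ones leaves exactly the low bd bits set, i.e. (1 << bd) - 1. A quick check, with -1 written as ~0u to keep the shift well defined in plain C:

#include <assert.h>
#include <stdio.h>

int main(void) {
  for (int bd = 8; bd <= 12; bd += 2) {  /* bit depths used by AV1 */
    const unsigned all_ones = ~0u;                        /* "-1" */
    const int max_xor = (int)(all_ones ^ (all_ones << bd));
    assert(max_xor == (1 << bd) - 1);
    printf("bd=%2d -> pixel max %d\n", bd, max_xor);
  }
  return 0;
}
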
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
deleted file mode 100644
index a982574..0000000
--- a/av1/common/x86/convolve_2d_avx2.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "av1/common/convolve.h"
-
-void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int subpel_y_qn,
-                             ConvolveParams *conv_params) {
-  const int bd = 8;
-  int im_stride = 8, i;
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-
-  assert(conv_params->round_0 > 0);
-
-  const __m256i round_const_h = _mm256_set1_epi16(
-      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
-  const __m256i sum_round_v = _mm256_set1_epi32(
-      (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
-  const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
-  const __m256i round_const_v = _mm256_set1_epi32(
-      ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
-      ((1 << (offset_bits - conv_params->round_1)) >> 1));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
-
-  __m256i filt[4], coeffs_h[4], coeffs_v[4];
-
-  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
-
-  const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
-  int horiz_tap = SUBPEL_TAPS;
-  int vert_tap = SUBPEL_TAPS;
-
-  if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
-    horiz_tap = 4;
-  else if (!(filter_x[0] | filter_x[7]))
-    horiz_tap = 6;
-
-  if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
-    vert_tap = 4;
-  else if (!(filter_y[0] | filter_y[7]))
-    vert_tap = 6;
-
-  if (horiz_tap == 6)
-    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-  else
-    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-
-  if (vert_tap == 6)
-    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
-  else
-    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
-
-  int im_h = h + vert_tap - 1;
-  const int fo_vert = vert_tap / 2 - 1;
-  const int fo_horiz = horiz_tap / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  for (int j = 0; j < w; j += 8) {
-    if (horiz_tap == 4) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
-    } else if (horiz_tap == 6) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
-    } else {
-      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
-    }
-
-    if (vert_tap == 4) {
-      CONVOLVE_SR_VERTICAL_FILTER_4TAP
-    } else if (vert_tap == 6) {
-      CONVOLVE_SR_VERTICAL_FILTER_6TAP
-    } else {
-      CONVOLVE_SR_VERTICAL_FILTER_8TAP
-    }
-  }
-}
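
The deleted convolve kernels shorten the subpel filter when its outer coefficients are zero (8 -> 6 -> 4 taps), which is what the horiz_tap/vert_tap checks above compute before choosing the horizontal and vertical filter macros. A scalar restatement of that check:

#include <stdint.h>

/* Same reduction as in the removed av1_convolve_2d_sr_avx2: treat the
 * 8-entry kernel as 4-tap when coefficients 0, 1, 6, 7 are all zero, as
 * 6-tap when only 0 and 7 are zero, otherwise as a full 8-tap filter. */
static int effective_taps(const int16_t filter[8]) {
  if (!(filter[0] | filter[1] | filter[6] | filter[7])) return 4;
  if (!(filter[0] | filter[7])) return 6;
  return 8;
}
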
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
deleted file mode 100644
index 242f62f..0000000
--- a/av1/common/x86/convolve_2d_sse2.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_sse2.h"
-#include "av1/common/convolve.h"
-
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int subpel_y_qn,
-                             ConvolveParams *conv_params) {
-  const int bd = 8;
-
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-  const __m128i zero = _mm_setzero_si128();
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-
-  assert(conv_params->round_0 > 0);
-
-  /* Horizontal filter */
-  {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_qn & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
-    for (i = 0; i < im_h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        // Filter even-index pixels
-        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even =
-            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i sum_round =
-        _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
-    const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
-        ((1 << (offset_bits - conv_params->round_1)) >> 1));
-    const __m128i round_shift = _mm_cvtsi32_si128(bits);
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const int16_t *data = &im_block[i * im_stride + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
-        __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
-
-        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
-                                     round_shift);
-        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
-                                     round_shift);
-
-        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        const __m128i res = _mm_packus_epi16(res16, res16);
-
-        // Accumulate values into the destination buffer
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-
-        if (w == 2) {
-          *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
-        } else if (w == 4) {
-          *(uint32_t *)p = _mm_cvtsi128_si32(res);
-        } else {
-          _mm_storel_epi64(p, res);
-        }
-      }
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
-                                        uint8_t *dst0, int dst_stride0, int w,
-                                        int h, ConvolveParams *conv_params) {
-  const int bd = 8;
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i left_shift = _mm_cvtsi32_si128(bits);
-  int i, j;
-
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m128i offset_const = _mm_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
-
-  assert((w % 4) == 0);
-
-  if (!(w % 16)) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
-
-        const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
-        const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
-
-        const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
-        const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
-
-        const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
-        const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
-
-        if (do_average) {
-          const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
-          const __m128i data_ref_0_hi =
-              _mm_loadu_si128((__m128i *)(&dst[j + 8]));
-
-          const __m128i comp_avg_res_lo =
-              comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result_lo = convolve_rounding(
-              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i comp_avg_res_hi =
-              comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result_hi = convolve_rounding(
-              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 =
-              _mm_packus_epi16(round_result_lo, round_result_hi);
-
-          _mm_store_si128((__m128i *)(&dst0[j]), res_8);
-        } else {
-          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
-          _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
-        }
-      }
-      src += src_stride;
-      dst += dst_stride;
-      dst0 += dst_stride0;
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
-        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
-
-        const __m128i res = _mm_sll_epi16(d16_0, left_shift);
-        const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
-
-        if (do_average) {
-          const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
-
-          const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-
-          if (w > 4)
-            _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
-          else
-            *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
-        } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[j]), res_unsigned);
-#else
-          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-      }
-      src += src_stride;
-      dst += dst_stride;
-      dst0 += dst_stride0;
-    }
-  }
-}
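
For reference, the file removed above contains av1_convolve_2d_sr_sse2(), the 8-bit two-stage (horizontal, then vertical) convolution that keeps its intermediate in int16. Below is a minimal scalar sketch of that arithmetic using the same round_0/round_1 constants as the deleted kernel; the *_sketch helper names and the heap-allocated intermediate buffer are illustrative only, not library API.

#include <stdint.h>
#include <stdlib.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

static uint8_t clip_pixel_sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Scalar model of the removed 8-bit 2D convolution (8-tap separable filter).
static void convolve_2d_sr_sketch(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const int16_t x_filter[8],
                                  const int16_t y_filter[8],
                                  int round_0, int round_1) {
  enum { BD = 8, FILTER_BITS = 7, TAPS = 8 };
  const int bits = 2 * FILTER_BITS - round_0 - round_1;
  const int offset_bits = BD + 2 * FILTER_BITS - round_0;
  const int im_h = h + TAPS - 1;
  int16_t *im = malloc(sizeof(*im) * im_h * w);  // int16 intermediate, as im_block
  const uint8_t *src_ptr = src - (TAPS / 2 - 1) * src_stride - (TAPS / 2 - 1);

  // Horizontal pass: an offset of 1 << (BD + FILTER_BITS - 1) keeps the
  // rounded result non-negative so it fits in int16.
  for (int y = 0; y < im_h; ++y)
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << (BD + FILTER_BITS - 1);
      for (int k = 0; k < TAPS; ++k)
        sum += x_filter[k] * src_ptr[y * src_stride + x + k];
      im[y * w + x] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }

  // Vertical pass: second rounding by round_1, then the accumulated offsets
  // are subtracted and the result is shifted down to 8 bits with clamping.
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < TAPS; ++k)
        sum += y_filter[k] * im[(y + k) * w + x];
      const int32_t res = ROUND_POWER_OF_TWO(sum, round_1) -
                          (1 << (offset_bits - round_1)) -
                          (1 << (offset_bits - round_1 - 1));
      dst[y * dst_stride + x] = clip_pixel_sketch(ROUND_POWER_OF_TWO(res, bits));
    }
  free(im);
}
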
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
deleted file mode 100644
index e942eae..0000000
--- a/av1/common/x86/convolve_avx2.c
+++ /dev/null
@@ -1,617 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/synonyms.h"
-
-void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_y_qn) {
-  int i, j, vert_tap = SUBPEL_TAPS;
-  // right shift is F-1 because we are already dividing
-  // filter co-efficients by 2
-  const int right_shift_bits = (FILTER_BITS - 1);
-  const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
-  const __m256i right_shift_const =
-      _mm256_set1_epi16((1 << right_shift_bits) >> 1);
-
-  __m256i coeffs[4], s[8];
-  __m128i d[6];
-
-  // Condition for checking valid vert_filt taps
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
-    vert_tap = 4;
-  } else if (!(filter[0] | filter[7])) {
-    vert_tap = 6;
-  }
-
-  if (vert_tap == 6)
-    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
-  else
-    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
-
-  // vert_filt as 4 tap
-  if (vert_tap == 4) {
-    const int fo_vert = 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
-
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      const __m256i src_01a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
-      const __m256i src_12a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
-      const __m256i src_23a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
-      const __m256i src_34a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
-
-      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-
-      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
-      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
-        const __m256i src_45a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
-
-        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
-        const __m256i src_56a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
-
-        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
-        /* rounding code */
-        // shift by F - 1
-        const __m256i res_16b_lo = _mm256_sra_epi16(
-            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
-        if (w - j > 8) {
-          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
-
-          /* rounding code */
-          // shift by F - 1
-          const __m256i res_16b_hi = _mm256_sra_epi16(
-              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
-          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
-          const __m128i res_0 = _mm256_castsi256_si128(res_a);
-          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_1);
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                             res_1);
-          } else if (w - j > 2) {
-            xx_storel_32(&dst[i * dst_stride + j], res_0);
-            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
-          } else {
-            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-            __m128i *const p_1 =
-                (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-
-        s[3] = s[4];
-        s[4] = s[5];
-      }
-    }
-  } else if (vert_tap == 6) {
-    const int fo_vert = vert_tap / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      __m256i src6;
-
-      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      const __m256i src_01a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
-      const __m256i src_12a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
-      const __m256i src_23a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
-      const __m256i src_34a =
-          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
-
-      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-
-      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
-      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-        const __m256i src_45a = _mm256_permute2x128_si256(
-            src6,
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-            0x20);
-
-        src6 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-        const __m256i src_56a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-            src6, 0x20);
-
-        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
-
-        /* rounding code */
-        // shift by F - 1
-        const __m256i res_16b_lo = _mm256_sra_epi16(
-            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
-        if (w - j > 8) {
-          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
-
-          /* rounding code */
-          // shift by F - 1
-          const __m256i res_16b_hi = _mm256_sra_epi16(
-              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
-          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
-          const __m128i res_0 = _mm256_castsi256_si128(res_a);
-          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_1);
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                             res_1);
-          } else if (w - j > 2) {
-            xx_storel_32(&dst[i * dst_stride + j], res_0);
-            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
-          } else {
-            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-            __m128i *const p_1 =
-                (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[3] = s[4];
-        s[4] = s[5];
-      }
-    }
-  } else {
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      __m256i src6;
-
-      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
-      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      const __m256i src_01a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
-      const __m256i src_12a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
-      const __m256i src_23a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
-      const __m256i src_34a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
-
-      const __m256i src_45a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
-
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-      const __m256i src_56a =
-          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
-
-      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
-      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
-      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
-      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-        const __m256i src_67a = _mm256_permute2x128_si256(
-            src6,
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            0x20);
-
-        src6 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-        const __m256i src_78a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            src6, 0x20);
-
-        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
-        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
-
-        const __m256i res_lo = convolve_lowbd(s, coeffs);
-
-        /* rounding code */
-        // shift by F - 1
-        const __m256i res_16b_lo = _mm256_sra_epi16(
-            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
-        if (w - j > 8) {
-          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
-
-          /* rounding code */
-          // shift by F - 1
-          const __m256i res_16b_hi = _mm256_sra_epi16(
-              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
-          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
-          const __m128i res_0 = _mm256_castsi256_si128(res_a);
-          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_1);
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                             res_1);
-          } else if (w - j > 2) {
-            xx_storel_32(&dst[i * dst_stride + j], res_0);
-            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
-          } else {
-            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-            __m128i *const p_1 =
-                (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-      }
-    }
-  }
-}
-
-void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const int subpel_x_qn,
-                            ConvolveParams *conv_params) {
-  const int bits = FILTER_BITS - conv_params->round_0;
-
-  const __m256i round_0_const =
-      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
-  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-  const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
-  const __m128i round_shift = _mm_cvtsi32_si128(bits);
-  int i, horiz_tap = SUBPEL_TAPS;
-
-  assert(bits >= 0);
-  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
-         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
-  assert(conv_params->round_0 > 0);
-
-  __m256i coeffs[4], filt[4];
-  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
-    horiz_tap = 4;
-  } else if (!(filter[0] | filter[7])) {
-    horiz_tap = 6;
-  }
-
-  if (horiz_tap == 6)
-    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
-  else
-    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
-
-  // horz_filt as 4 tap
-  if (horiz_tap == 4) {
-    const int fo_horiz = 1;
-    const uint8_t *const src_ptr = src - fo_horiz;
-    if (w <= 8) {
-      for (i = 0; i < h; i += 2) {
-        const __m256i data = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
-            _mm256_castsi128_si256(_mm_loadu_si128(
-                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
-            0x20);
-
-        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
-
-        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                   round_0_shift);
-
-        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
-                                   round_shift);
-
-        /* rounding code */
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
-        if (w > 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
-        } else if (w > 2) {
-          xx_storel_32(&dst[i * dst_stride], res_0);
-          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
-        } else {
-          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
-          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
-          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
-        }
-      }
-    } else {
-      for (i = 0; i < h; ++i) {
-        for (int j = 0; j < w; j += 16) {
-          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
-          // 18 19 20 21 22 23
-          const __m256i data = _mm256_inserti128_si256(
-              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
-              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
-              1);
-
-          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
-
-          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                     round_0_shift);
-
-          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
-                                     round_shift);
-
-          /* rounding code */
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-          // Store values into the destination buffer
-          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
-          __m128i res = _mm256_castsi256_si128(res_8b);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-        }
-      }
-    }
-  } else if (horiz_tap == 6) {
-    const int fo_horiz = horiz_tap / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_horiz;
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    if (w <= 8) {
-      for (i = 0; i < h; i += 2) {
-        const __m256i data = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
-            _mm256_castsi128_si256(_mm_loadu_si128(
-                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
-            0x20);
-
-        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
-
-        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                   round_0_shift);
-
-        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
-                                   round_shift);
-
-        /* rounding code */
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-        if (w > 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
-        } else if (w > 2) {
-          xx_storel_32(&dst[i * dst_stride], res_0);
-          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
-        } else {
-          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
-          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
-          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
-        }
-      }
-    } else {
-      for (i = 0; i < h; ++i) {
-        for (int j = 0; j < w; j += 16) {
-          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
-          // 18 19 20 21 22 23
-          const __m256i data = _mm256_inserti128_si256(
-              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
-              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
-              1);
-
-          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
-
-          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                     round_0_shift);
-
-          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
-                                     round_shift);
-
-          /* rounding code */
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-          // Store values into the destination buffer
-          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
-          __m128i res = _mm256_castsi256_si128(res_8b);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-        }
-      }
-    }
-  } else {
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_horiz;
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    if (w <= 8) {
-      for (i = 0; i < h; i += 2) {
-        const __m256i data = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
-            _mm256_castsi128_si256(_mm_loadu_si128(
-                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
-            0x20);
-
-        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
-
-        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                   round_0_shift);
-
-        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
-                                   round_shift);
-
-        /* rounding code */
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-        if (w > 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
-        } else if (w > 2) {
-          xx_storel_32(&dst[i * dst_stride], res_0);
-          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
-        } else {
-          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
-          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
-          *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
-          *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
-        }
-      }
-    } else {
-      for (i = 0; i < h; ++i) {
-        for (int j = 0; j < w; j += 16) {
-          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
-          // 18 19 20 21 22 23
-          const __m256i data = _mm256_inserti128_si256(
-              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
-              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
-              1);
-
-          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
-
-          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
-                                     round_0_shift);
-
-          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
-                                     round_shift);
-
-          /* rounding code */
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
-          // Store values into the destination buffer
-          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
-          __m128i res = _mm256_castsi256_si128(res_8b);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-        }
-      }
-    }
-  }
-}
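
The AVX2 kernels removed above pick a 4-, 6-, or 8-tap code path by testing whether the outer taps of the selected subpel kernel are zero. In scalar form, the decision they all share is simply the following (the function name is illustrative):

#include <stdint.h>

// Tap-reduction check used by the removed AVX2 paths: zero outer taps let a
// narrower 4- or 6-tap filtering loop be used instead of the full 8-tap one.
static int effective_taps_sketch(const int16_t filter[8]) {
  if (!(filter[0] | filter[1] | filter[6] | filter[7])) return 4;  // 4-tap
  if (!(filter[0] | filter[7])) return 6;                          // 6-tap
  return 8;                                                        // SUBPEL_TAPS
}
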
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
deleted file mode 100644
index 5ceb40b..0000000
--- a/av1/common/x86/convolve_sse2.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
-#include "av1/common/convolve.h"
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
-                                  const int subpel_q4,
-                                  __m128i *const coeffs /* [4] */) {
-  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-  const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-  coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0);  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1);  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1);  // coeffs 6 7 6 7 6 7 6 7
-}
-
-static INLINE __m128i convolve(const __m128i *const s,
-                               const __m128i *const coeffs) {
-  const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
-  const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
-  const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
-  const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
-  const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
-  return d;
-}
-
-static INLINE __m128i convolve_lo_x(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
-  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
-  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_lo_y(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
-  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
-  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_hi_y(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
-  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
-  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_y_qn) {
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint8_t *src_ptr = src - fo_vert * src_stride;
-  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
-  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
-  __m128i coeffs[4];
-
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
-
-  if (w <= 4) {
-    __m128i s[8], src6, res, res_round, res16;
-    uint32_t res_int;
-    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
-    s[0] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
-    s[1] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
-    s[2] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
-    s[3] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
-    s[4] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
-    s[5] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
-
-    do {
-      s[6] = _mm_unpacklo_epi8(
-          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
-      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
-      s[7] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
-
-      res = convolve_lo_y(s + 0, coeffs);
-      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
-      res16 = _mm_packs_epi32(res_round, res_round);
-      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
-
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)res_int;
-      else
-        *(uint32_t *)dst = res_int;
-
-      src_ptr += src_stride;
-      dst += dst_stride;
-
-      res = convolve_lo_y(s + 1, coeffs);
-      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
-      res16 = _mm_packs_epi32(res_round, res_round);
-      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
-
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)res_int;
-      else
-        *(uint32_t *)dst = res_int;
-
-      src_ptr += src_stride;
-      dst += dst_stride;
-
-      s[0] = s[2];
-      s[1] = s[3];
-      s[2] = s[4];
-      s[3] = s[5];
-      s[4] = s[6];
-      s[5] = s[7];
-      h -= 2;
-    } while (h);
-  } else {
-    assert(!(w % 8));
-    int j = 0;
-    do {
-      __m128i s[8], src6, res_lo, res_hi;
-      __m128i res_lo_round, res_hi_round, res16, res;
-      const uint8_t *data = &src_ptr[j];
-
-      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
-      s[0] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
-      s[1] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
-      s[2] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
-      s[3] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
-      s[4] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
-      s[5] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
-
-      int i = 0;
-      do {
-        data = &src_ptr[i * src_stride + j];
-        s[6] = _mm_unpacklo_epi8(
-            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
-        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
-        s[7] = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
-
-        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
-
-        res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res = _mm_packus_epi16(res16, res16);
-
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        i++;
-
-        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
-
-        res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res = _mm_packus_epi16(res16, res16);
-
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        i++;
-
-        s[0] = s[2];
-        s[1] = s[3];
-        s[2] = s[4];
-        s[3] = s[5];
-        s[4] = s[6];
-        s[5] = s[7];
-      } while (i < h);
-      j += 8;
-    } while (j < w);
-  }
-}
-
-void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_x,
-                            const int subpel_x_qn,
-                            ConvolveParams *conv_params) {
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - fo_horiz;
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const __m128i round_0_const =
-      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
-  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
-  const __m128i round_shift = _mm_cvtsi32_si128(bits);
-  __m128i coeffs[4];
-
-  assert(bits >= 0);
-  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
-         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
-
-  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
-
-  if (w <= 4) {
-    do {
-      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
-      __m128i s[4];
-
-      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
-      s[1] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
-      s[2] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
-      s[3] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
-      const __m128i res_lo = convolve_lo_x(s, coeffs);
-      __m128i res_lo_round =
-          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
-      res_lo_round =
-          _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
-
-      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
-      const __m128i res = _mm_packus_epi16(res16, res16);
-
-      uint32_t r = _mm_cvtsi128_si32(res);
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)r;
-      else
-        *(uint32_t *)dst = r;
-
-      src_ptr += src_stride;
-      dst += dst_stride;
-    } while (--h);
-  } else {
-    assert(!(w % 8));
-    int i = 0;
-    do {
-      int j = 0;
-      do {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        __m128i s[4];
-
-        // Filter even-index pixels
-        s[0] = data;
-        s[1] = _mm_srli_si128(data, 2);
-        s[2] = _mm_srli_si128(data, 4);
-        s[3] = _mm_srli_si128(data, 6);
-        const __m128i res_even = convolve_lo_x(s, coeffs);
-
-        // Filter odd-index pixels
-        s[0] = _mm_srli_si128(data, 1);
-        s[1] = _mm_srli_si128(data, 3);
-        s[2] = _mm_srli_si128(data, 5);
-        s[3] = _mm_srli_si128(data, 7);
-        const __m128i res_odd = convolve_lo_x(s, coeffs);
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-        __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
-        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
-                                     round_shift);
-        __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
-        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
-                                     round_shift);
-
-        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        const __m128i res = _mm_packus_epi16(res16, res16);
-
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        j += 8;
-      } while (j < w);
-    } while (++i < h);
-  }
-}
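
All of the SSE2 kernels removed above rely on the same trick: prepare_coeffs() broadcasts the filter taps in adjacent pairs (0 1 0 1 ..., 2 3 2 3 ..., ...) so that a single _mm_madd_epi16 yields the partial sum c[2k]*s[2k] + c[2k+1]*s[2k+1] in each 32-bit lane. A self-contained restatement of that accumulation step, mirroring the deleted static convolve() helper (the *_sketch name is illustrative):

#include <emmintrin.h>

// Four madds over pair-duplicated coefficients plus two adds give the full
// 8-tap sum per 32-bit lane.
static __m128i convolve_pairs_sketch(const __m128i s[4], const __m128i c[4]) {
  const __m128i d01 = _mm_madd_epi16(s[0], c[0]);  // taps 0,1
  const __m128i d23 = _mm_madd_epi16(s[1], c[1]);  // taps 2,3
  const __m128i d45 = _mm_madd_epi16(s[2], c[2]);  // taps 4,5
  const __m128i d67 = _mm_madd_epi16(s[3], c[3]);  // taps 6,7
  return _mm_add_epi32(_mm_add_epi32(d01, d23), _mm_add_epi32(d45, d67));
}
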
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c
index adc2ff2..087204d 100644
--- a/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4124,8 +4124,8 @@
   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
   const int input_stride = AOMMIN(32, txfm_size_col);
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int fun_idx_x = highbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = highbd_txfm_all_1d_zeros_idx[eoby];
   const transform_1d_avx2 row_txfm =
       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
   const transform_1d_avx2 col_txfm =
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index eddf5b0..dc6dbaa 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -5209,7 +5209,7 @@
   const int buf_size_w_div4 = input_stride >> 2;
   const int buf_size_h_div8 = (eoby + 8) >> 3;
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int fun_idx = highbd_txfm_all_1d_zeros_idx[eoby];
   const transform_1d_sse4_1 row_txfm =
       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
   const transform_1d_sse4_1 col_txfm =
@@ -5273,7 +5273,7 @@
   const int row_max = AOMMIN(32, txfm_size_row);
   const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
-  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx = highbd_txfm_all_1d_zeros_idx[eobx];
   const transform_1d_sse4_1 row_txfm =
       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
   const transform_1d_sse4_1 col_txfm =
@@ -5413,8 +5413,8 @@
   const int input_stride = AOMMIN(32, txfm_size_col);
   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
 
-  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
-  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const int fun_idx_x = highbd_txfm_all_1d_zeros_idx[eobx];
+  const int fun_idx_y = highbd_txfm_all_1d_zeros_idx[eoby];
   const transform_1d_sse4_1 row_txfm =
       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
   const transform_1d_sse4_1 col_txfm =
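
The highbd_inv_txfm hunks above only retarget the eob lookup: with the low-bitdepth path gone, these high-bitdepth inverse transforms index highbd_txfm_all_1d_zeros_idx instead of the removed low-bitdepth table. The resulting index selects how much of the 1D transform actually has to run for the nonzero region. A rough illustration of that idea follows; the thresholds below are assumed for illustration only, the real mapping lives in the library table:

// Illustration only (assumed thresholds): the last nonzero coefficient
// position picks a 1D transform that touches only the rows/columns which can
// still hold nonzero coefficients.
static int txfm_zeros_idx_sketch(int eob) {
  if (eob <= 8) return 0;   // only the first 8 rows/cols are nonzero
  if (eob <= 16) return 1;
  if (eob <= 32) return 2;
  return 3;                 // full-size transform
}
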
diff --git a/av1/common/x86/intra_edge_sse4.c b/av1/common/x86/intra_edge_sse4.c
index 3ed27bb..5b552c2 100644
--- a/av1/common/x86/intra_edge_sse4.c
+++ b/av1/common/x86/intra_edge_sse4.c
@@ -16,104 +16,6 @@
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
-void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
-  if (!strength) return;
-
-  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
-    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
-    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
-    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
-  };
-
-  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
-    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
-    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
-    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  };
-
-  // Extend the first and last samples to simplify the loop for the 5-tap case
-  p[-1] = p[0];
-  __m128i last = _mm_set1_epi8(p[sz - 1]);
-  _mm_storeu_si128((__m128i *)&p[sz], last);
-
-  // Adjust input pointer for filter support area
-  uint8_t *in = (strength == 3) ? p - 1 : p;
-
-  // Avoid modifying first sample
-  uint8_t *out = p + 1;
-  int len = sz - 1;
-
-  const int use_3tap_filter = (strength < 3);
-
-  if (use_3tap_filter) {
-    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
-    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
-    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
-    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
-    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
-    while (len > 0) {
-      int n_out = (len < 8) ? len : 8;
-      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
-      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
-      d0 = _mm_maddubs_epi16(d0, coef0);
-      d1 = _mm_maddubs_epi16(d1, coef0);
-      d0 = _mm_hadd_epi16(d0, d1);
-      __m128i eight = _mm_set1_epi16(8);
-      d0 = _mm_add_epi16(d0, eight);
-      d0 = _mm_srai_epi16(d0, 4);
-      d0 = _mm_packus_epi16(d0, d0);
-      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
-      __m128i n0 = _mm_set1_epi8(n_out);
-      __m128i mask = _mm_cmpgt_epi8(n0, iden);
-      out0 = _mm_blendv_epi8(out0, d0, mask);
-      _mm_storel_epi64((__m128i *)out, out0);
-      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
-      in0 = _mm_alignr_epi8(in1, in0, 8);
-      in += 8;
-      out += 8;
-      len -= n_out;
-    }
-  } else {  // 5-tap filter
-    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
-    __m128i two = _mm_set1_epi8(2);
-    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
-    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
-    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
-    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
-    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
-    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
-    while (len > 0) {
-      int n_out = (len < 8) ? len : 8;
-      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
-      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
-      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
-      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
-      d0 = _mm_maddubs_epi16(d0, coef0);
-      d1 = _mm_maddubs_epi16(d1, coef0);
-      d2 = _mm_maddubs_epi16(d2, coef0);
-      d3 = _mm_maddubs_epi16(d3, coef0);
-      d0 = _mm_hadd_epi16(d0, d1);
-      d2 = _mm_hadd_epi16(d2, d3);
-      d0 = _mm_hadd_epi16(d0, d2);
-      __m128i eight = _mm_set1_epi16(8);
-      d0 = _mm_add_epi16(d0, eight);
-      d0 = _mm_srai_epi16(d0, 4);
-      d0 = _mm_packus_epi16(d0, d0);
-      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
-      __m128i n0 = _mm_set1_epi8(n_out);
-      __m128i mask = _mm_cmpgt_epi8(n0, iden);
-      out0 = _mm_blendv_epi8(out0, d0, mask);
-      _mm_storel_epi64((__m128i *)out, out0);
-      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
-      in0 = _mm_alignr_epi8(in1, in0, 8);
-      in += 8;
-      out += 8;
-      len -= n_out;
-    }
-  }
-}
-
 void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
   if (!strength) return;
 
@@ -205,66 +107,6 @@
   }
 }
 
-void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
-  // interpolate half-sample positions
-  assert(sz <= 24);
-
-  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
-    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
-  };
-
-  DECLARE_ALIGNED(
-      16, static const int8_t,
-      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
-                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
-
-  // Extend first/last samples (upper-left p[-1], last p[sz-1])
-  // to support 4-tap filter
-  p[-2] = p[-1];
-  p[sz] = p[sz - 1];
-
-  uint8_t *in = &p[-2];
-  uint8_t *out = &p[-2];
-
-  int n = sz + 1;  // Input length including upper-left sample
-
-  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
-  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
-
-  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
-  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
-  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
-
-  while (n > 0) {
-    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
-    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
-    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
-    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
-    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
-    d0 = _mm_maddubs_epi16(d0, coef0);
-    d1 = _mm_maddubs_epi16(d1, coef0);
-    d2 = _mm_maddubs_epi16(d2, coef0);
-    d3 = _mm_maddubs_epi16(d3, coef0);
-    d0 = _mm_hadd_epi16(d0, d1);
-    d2 = _mm_hadd_epi16(d2, d3);
-    __m128i eight = _mm_set1_epi16(8);
-    d0 = _mm_add_epi16(d0, eight);
-    d2 = _mm_add_epi16(d2, eight);
-    d0 = _mm_srai_epi16(d0, 4);
-    d2 = _mm_srai_epi16(d2, 4);
-    d0 = _mm_packus_epi16(d0, d2);
-    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
-    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
-    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
-    _mm_storeu_si128((__m128i *)&out[0], out0);
-    _mm_storeu_si128((__m128i *)&out[16], out1);
-    in0 = in16;
-    in16 = _mm_setzero_si128();
-    out += 32;
-    n -= 16;
-  }
-}
-
 void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
   // interpolate half-sample positions
   assert(sz <= 24);
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
deleted file mode 100644
index d93d3ec..0000000
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ /dev/null
@@ -1,978 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/convolve_common_intrin.h"
-#include "aom_dsp/x86/convolve_sse4_1.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "av1/common/convolve.h"
-
-static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
-  (void)conv_params;
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16((int16_t)w0);
-  const __m256i wt1 = _mm256_set1_epi16((int16_t)w1);
-  const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
-  return wt;
-}
-
-static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
-  return _mm256_permute2x128_si256(
-      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
-      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
-}
-
-void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const int subpel_x_qn,
-                                  ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  const int bd = 8;
-  int i, j, is_horiz_4tap = 0;
-  const int bits = FILTER_BITS - conv_params->round_1;
-  const __m256i wt = unpack_weights_avx2(conv_params);
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m256i offset_const = _mm256_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
-
-  assert(bits >= 0);
-  assert(conv_params->round_0 > 0);
-
-  const __m256i round_const =
-      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
-  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
-  __m256i filt[4], coeffs[4];
-
-  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
-
-  // Condition for checking valid horz_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
-    is_horiz_4tap = 1;
-
-  // horz_filt as 4 tap
-  if (is_horiz_4tap) {
-    const int fo_horiz = 1;
-    const uint8_t *const src_ptr = src - fo_horiz;
-    for (i = 0; i < h; i += 2) {
-      const uint8_t *src_data = src_ptr + i * src_stride;
-      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
-      for (j = 0; j < w; j += 8) {
-        const __m256i data =
-            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
-
-        __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
-        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
-        res = _mm256_slli_epi16(res, bits);
-
-        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m256i data_ref_0 =
-              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
-          const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m256i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-          if (w > 4) {
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storel_epi64(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-          } else {
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_0);
-            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                _mm_cvtsi128_si32(res_1);
-          }
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                           res_1);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                          res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-      }
-    }
-  } else {
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_horiz;
-
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-    for (i = 0; i < h; i += 2) {
-      const uint8_t *src_data = src_ptr + i * src_stride;
-      CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
-      for (j = 0; j < w; j += 8) {
-        const __m256i data =
-            load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
-
-        __m256i res = convolve_lowbd_x(data, coeffs, filt);
-
-        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
-
-        res = _mm256_slli_epi16(res, bits);
-
-        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m256i data_ref_0 =
-              load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
-          const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m256i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-          if (w > 4) {
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storel_epi64(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-          } else {
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_0);
-            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                _mm_cvtsi128_si32(res_1);
-          }
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                           res_1);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                          res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-      }
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  const int bd = 8;
-  int i, j, is_vert_4tap = 0;
-  // +1 to compensate for dividing the filter coeffs by 2
-  const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
-  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-  const __m256i wt = unpack_weights_avx2(conv_params);
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m256i offset_const = _mm256_set1_epi16(offset);
-  const int offset_1 = (1 << (bd + FILTER_BITS - 2));
-  const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
-  const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i coeffs[4], s[8];
-
-  assert((FILTER_BITS - conv_params->round_0) >= 0);
-
-  prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
-
-  // Condition for checking valid vert_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
-    is_vert_4tap = 1;
-
-  if (is_vert_4tap) {
-    const int fo_vert = 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      __m256i src4;
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      {
-        __m256i src_ab[4];
-        __m256i src_a[5];
-        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-        for (int kk = 0; kk < 4; ++kk) {
-          data += src_stride;
-          src_a[kk + 1] =
-              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-          src_ab[kk] =
-              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
-        }
-        src4 = src_a[4];
-        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
-        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
-
-        s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
-        s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
-      }
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[(i + 5) * src_stride + j];
-        const __m256i src5 =
-            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-        const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
-
-        src4 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + src_stride)));
-        const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
-
-        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-        __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
-
-        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
-
-        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
-        const __m256i res_lo_0_shift =
-            _mm256_slli_epi32(res_lo_0_32b, left_shift);
-        const __m256i res_lo_0_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
-
-        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
-        const __m256i res_lo_1_shift =
-            _mm256_slli_epi32(res_lo_1_32b, left_shift);
-        const __m256i res_lo_1_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
-
-        const __m256i res_lo_round =
-            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
-
-        const __m256i res_lo_unsigned =
-            _mm256_add_epi16(res_lo_round, offset_const_2);
-
-        if (w - j < 16) {
-          if (do_average) {
-            const __m256i data_ref_0 =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-            const __m256i comp_avg_res =
-                comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i round_result = convolve_rounding(
-                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result, round_result);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-            if (w - j > 4) {
-              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-              _mm_storel_epi64(
-                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
-                  res_1);
-            } else {
-              *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                  _mm_cvtsi128_si32(res_0);
-              *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                  _mm_cvtsi128_si32(res_1);
-            }
-          } else {
-            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
-            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                             res_1);
-#else
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-          }
-        } else {
-          __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
-
-          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
-
-          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
-          const __m256i res_hi_0_shift =
-              _mm256_slli_epi32(res_hi_0_32b, left_shift);
-          const __m256i res_hi_0_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
-
-          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
-          const __m256i res_hi_1_shift =
-              _mm256_slli_epi32(res_hi_1_32b, left_shift);
-          const __m256i res_hi_1_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
-
-          const __m256i res_hi_round =
-              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
-
-          const __m256i res_hi_unsigned =
-              _mm256_add_epi16(res_hi_round, offset_const_2);
-
-          if (do_average) {
-            const __m256i data_ref_0_lo =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-
-            const __m256i data_ref_0_hi =
-                load_line2_avx2(&dst[i * dst_stride + j + 8],
-                                &dst[i * dst_stride + j + 8 + dst_stride]);
-
-            const __m256i comp_avg_res_lo = comp_avg(
-                &data_ref_0_lo, &res_lo_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i comp_avg_res_hi = comp_avg(
-                &data_ref_0_hi, &res_hi_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i round_result_lo =
-                convolve_rounding(&comp_avg_res_lo, &offset_const,
-                                  &rounding_const, rounding_shift);
-
-            const __m256i round_result_hi =
-                convolve_rounding(&comp_avg_res_hi, &offset_const,
-                                  &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result_lo, round_result_hi);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT || CONFIG_TIP
-            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storeu_si128(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-#else
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT || CONFIG_TIP
-          } else {
-            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
-            const __m128i res_lo_1 =
-                _mm256_extracti128_si256(res_lo_unsigned, 1);
-            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
-            const __m128i res_hi_1 =
-                _mm256_extracti128_si256(res_hi_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                             res_lo_1);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
-                             res_hi_0);
-            _mm_storeu_si128(
-                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
-                res_hi_1);
-#else
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_lo_1);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
-                            res_hi_0);
-            _mm_store_si128(
-                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
-                res_hi_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-
-        s[3] = s[4];
-        s[4] = s[5];
-      }
-    }
-  } else {
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      __m256i src6;
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      {
-        __m256i src_ab[7];
-        __m256i src_a[7];
-        src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-        for (int kk = 0; kk < 6; ++kk) {
-          data += src_stride;
-          src_a[kk + 1] =
-              _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-          src_ab[kk] =
-              _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
-        }
-        src6 = src_a[6];
-        s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
-        s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
-        s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
-        s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
-        s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
-        s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
-      }
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[(i + 7) * src_stride + j];
-        const __m256i src7 =
-            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
-        const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
-
-        src6 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + src_stride)));
-        const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
-
-        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
-        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
-
-        __m256i res_lo = convolve_lowbd(s, coeffs);
-
-        res_lo = _mm256_add_epi16(res_lo, offset_const_1);
-
-        const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
-        const __m256i res_lo_0_shift =
-            _mm256_slli_epi32(res_lo_0_32b, left_shift);
-        const __m256i res_lo_0_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
-
-        const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
-        const __m256i res_lo_1_shift =
-            _mm256_slli_epi32(res_lo_1_32b, left_shift);
-        const __m256i res_lo_1_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
-
-        const __m256i res_lo_round =
-            _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
-
-        const __m256i res_lo_unsigned =
-            _mm256_add_epi16(res_lo_round, offset_const_2);
-
-        if (w - j < 16) {
-          if (do_average) {
-            const __m256i data_ref_0 =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-            const __m256i comp_avg_res =
-                comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i round_result = convolve_rounding(
-                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result, round_result);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-            if (w - j > 4) {
-              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-              _mm_storel_epi64(
-                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
-                  res_1);
-            } else {
-              *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                  _mm_cvtsi128_si32(res_0);
-              *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                  _mm_cvtsi128_si32(res_1);
-            }
-          } else {
-            const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
-            const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                             res_1);
-#else
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-          }
-        } else {
-          __m256i res_hi = convolve_lowbd(s + 4, coeffs);
-
-          res_hi = _mm256_add_epi16(res_hi, offset_const_1);
-
-          const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
-          const __m256i res_hi_0_shift =
-              _mm256_slli_epi32(res_hi_0_32b, left_shift);
-          const __m256i res_hi_0_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
-
-          const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
-          const __m256i res_hi_1_shift =
-              _mm256_slli_epi32(res_hi_1_32b, left_shift);
-          const __m256i res_hi_1_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
-
-          const __m256i res_hi_round =
-              _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
-
-          const __m256i res_hi_unsigned =
-              _mm256_add_epi16(res_hi_round, offset_const_2);
-
-          if (do_average) {
-            const __m256i data_ref_0_lo =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-
-            const __m256i data_ref_0_hi =
-                load_line2_avx2(&dst[i * dst_stride + j + 8],
-                                &dst[i * dst_stride + j + 8 + dst_stride]);
-
-            const __m256i comp_avg_res_lo = comp_avg(
-                &data_ref_0_lo, &res_lo_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i comp_avg_res_hi = comp_avg(
-                &data_ref_0_hi, &res_hi_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i round_result_lo =
-                convolve_rounding(&comp_avg_res_lo, &offset_const,
-                                  &rounding_const, rounding_shift);
-
-            const __m256i round_result_hi =
-                convolve_rounding(&comp_avg_res_hi, &offset_const,
-                                  &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result_lo, round_result_hi);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT || CONFIG_TIP
-            _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storeu_si128(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-#else
-            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_store_si128(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT || CONFIG_TIP
-          } else {
-            const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
-            const __m128i res_lo_1 =
-                _mm256_extracti128_si256(res_lo_unsigned, 1);
-            const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
-            const __m128i res_hi_1 =
-                _mm256_extracti128_si256(res_hi_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                             res_lo_1);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
-                             res_hi_0);
-            _mm_storeu_si128(
-                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
-                res_hi_1);
-#else
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_lo_1);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
-                            res_hi_0);
-            _mm_store_si128(
-                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
-                res_hi_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-      }
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
-                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
-                                   ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  const int bd = 8;
-
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
-
-  int im_stride = 8;
-  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
-  const __m256i wt = unpack_weights_avx2(conv_params);
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m256i offset_const = _mm256_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
-
-  assert(conv_params->round_0 > 0);
-
-  const __m256i round_const_h = _mm256_set1_epi16(
-      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
-  const __m256i round_const_v = _mm256_set1_epi32(
-      ((1 << conv_params->round_1) >> 1) -
-      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
-  __m256i filt[4], coeffs_x[4], coeffs_y[4];
-
-  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
-
-  // Condition for checking valid horz_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
-    is_horiz_4tap = 1;
-
-  // Condition for checking valid vert_filt taps
-  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
-    is_vert_4tap = 1;
-
-  if (is_horiz_4tap) {
-    int im_h = h + filter_params_y->taps - 1;
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const int fo_horiz = 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-    for (int j = 0; j < w; j += 8) {
-      /* Horizontal filter */
-      const uint8_t *src_h = src_ptr + j;
-      for (i = 0; i < im_h; i += 2) {
-        __m256i data =
-            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
-        if (i + 1 < im_h)
-          data = _mm256_inserti128_si256(
-              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
-        src_h += (src_stride << 1);
-        __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
-
-        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
-                               round_shift_h);
-
-        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
-      }
-      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
-    }
-  } else if (is_vert_4tap) {
-    int im_h = h + 3;
-    const int fo_vert = 1;
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    for (int j = 0; j < w; j += 8) {
-      /* Horizontal filter */
-      const uint8_t *src_h = src_ptr + j;
-      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
-
-      /* Vertical filter */
-      __m256i s[6];
-      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
-      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
-      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
-      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-
-      s[0] = _mm256_unpacklo_epi16(s0, s1);
-      s[1] = _mm256_unpacklo_epi16(s2, s3);
-
-      s[3] = _mm256_unpackhi_epi16(s0, s1);
-      s[4] = _mm256_unpackhi_epi16(s2, s3);
-
-      for (i = 0; i < h; i += 2) {
-        const int16_t *data = &im_block[i * im_stride];
-
-        const __m256i s4 =
-            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
-        const __m256i s5 =
-            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
-
-        s[2] = _mm256_unpacklo_epi16(s4, s5);
-        s[5] = _mm256_unpackhi_epi16(s4, s5);
-
-        const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
-        const __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
-
-        if (w - j > 4) {
-          const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
-          const __m256i res_b_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
-          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
-
-          if (do_average) {
-            const __m256i data_ref_0 =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-            const __m256i comp_avg_res =
-                comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i round_result = convolve_rounding(
-                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result, round_result);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storel_epi64(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-          } else {
-            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                             res_1);
-#else
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-          }
-        } else {
-          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
-          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
-
-          if (do_average) {
-            const __m256i data_ref_0 =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-
-            const __m256i comp_avg_res =
-                comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-            const __m256i round_result = convolve_rounding(
-                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result, round_result);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_0);
-            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                _mm_cvtsi128_si32(res_1);
-
-          } else {
-            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                             res_1);
-#else
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[3] = s[4];
-        s[4] = s[5];
-      }
-    }
-  } else {
-    int im_h = h + filter_params_y->taps - 1;
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const int fo_horiz = filter_params_x->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-    for (int j = 0; j < w; j += 8) {
-      /* Horizontal filter */
-      const uint8_t *src_h = src_ptr + j;
-      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
-
-      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
-    }
-  }
-}
-
-void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
-                                        uint8_t *dst0, int dst_stride0, int w,
-                                        int h, ConvolveParams *conv_params) {
-  const int bd = 8;
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
-  const __m128i left_shift = _mm_cvtsi32_si128(bits);
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const __m256i wt = unpack_weights_avx2(conv_params);
-  const __m256i zero = _mm256_setzero_si256();
-
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m256i offset_const = _mm256_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
-  int i, j;
-
-  if (!(w % 16)) {
-    for (i = 0; i < h; i += 1) {
-      for (j = 0; j < w; j += 16) {
-        const __m256i src_16bit = _mm256_cvtepu8_epi16(
-            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
-
-        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
-        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
-
-        if (do_average) {
-          const __m256i data_ref_0 =
-              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
-
-          const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m256i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
-          const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
-
-#if CONFIG_TIP
-          _mm_storeu_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
-                           _mm256_castsi256_si128(res_0));
-#else
-          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
-                          _mm256_castsi256_si128(res_0));
-#endif  // CONFIG_TIP
-        } else {
-          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
-                             res_unsigned);
-        }
-      }
-    }
-  } else if (!(w % 4)) {
-    for (i = 0; i < h; i += 2) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i src_row_0 =
-            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
-        const __m128i src_row_1 =
-            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
-        // since not all compilers yet support _mm256_set_m128i()
-        const __m256i src_10 = _mm256_insertf128_si256(
-            _mm256_castsi128_si256(src_row_0), src_row_1, 1);
-
-        const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
-
-        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
-
-        const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m256i data_ref_0 = load_line2_avx2(
-              &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
-          const __m256i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m256i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-          if (w > 4) {
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storel_epi64(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-          } else {
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_0);
-            *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                _mm_cvtsi128_si32(res_1);
-          }
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                           res_1);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                          res_1);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-      }
-    }
-  }
-}
diff --git a/av1/common/x86/jnt_convolve_sse2.c b/av1/common/x86/jnt_convolve_sse2.c
deleted file mode 100644
index 6ab726d..0000000
--- a/av1/common/x86/jnt_convolve_sse2.c
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_sse2.h"
-
-void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const int subpel_x_qn,
-                                  ConvolveParams *conv_params) {
-  const int bd = 8;
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint8_t *src_ptr = src - fo_horiz;
-  const int bits = FILTER_BITS - conv_params->round_1;
-  const __m128i left_shift = _mm_cvtsi32_si128(bits);
-  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
-  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m128i offset_const = _mm_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
-  __m128i coeffs[4];
-
-  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
-
-  if (w == 4) {
-    do {
-      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
-      __m128i s[4];
-
-      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
-      s[1] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
-      s[2] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
-      s[3] =
-          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
-      const __m128i res_lo = convolve_lo_x(s, coeffs);
-      const __m128i res_lo_round =
-          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-      const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
-
-      const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
-      const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-      // Accumulate values into the destination buffer
-      if (do_average) {
-        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
-
-        const __m128i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-        const __m128i round_result = convolve_rounding(
-            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-        *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
-      } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-        _mm_storeu_si128((__m128i *)(&dst[0]), res_unsigned);
-#else
-        _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-      }
-      src_ptr += src_stride;
-      dst += dst_stride;
-      dst0 += dst_stride0;
-    } while (--h);
-  } else {
-    assert(!(w % 8));
-    int i = 0;
-    do {
-      int j = 0;
-      do {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        __m128i s[4];
-
-        // Filter even-index pixels
-        s[0] = data;
-        s[1] = _mm_srli_si128(data, 2);
-        s[2] = _mm_srli_si128(data, 4);
-        s[3] = _mm_srli_si128(data, 6);
-        const __m128i res_even = convolve_lo_x(s, coeffs);
-
-        // Filter odd-index pixels
-        s[0] = _mm_srli_si128(data, 1);
-        s[1] = _mm_srli_si128(data, 3);
-        s[2] = _mm_srli_si128(data, 5);
-        s[3] = _mm_srli_si128(data, 7);
-        const __m128i res_odd = convolve_lo_x(s, coeffs);
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-        const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
-        const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
-
-        const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
-        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m128i data_ref_0 =
-              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
-
-          const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
-        } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-        j += 8;
-      } while (j < w);
-    } while (++i < h);
-  }
-}
-
-void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst0, int dst_stride0, int w, int h,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_y_qn,
-                                  ConvolveParams *conv_params) {
-  const int bd = 8;
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  const int dst_stride = conv_params->dst_stride;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint8_t *src_ptr = src - fo_vert * src_stride;
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const __m128i left_shift = _mm_cvtsi32_si128(bits);
-  const int do_average = conv_params->do_average;
-  const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
-  const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m128i offset_const = _mm_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
-  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
-  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-  __m128i coeffs[4];
-
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
-
-  if (w == 4) {
-    __m128i s[8], src6, res, res_shift;
-    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
-    s[0] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
-    s[1] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
-    s[2] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
-    s[3] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
-    s[4] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
-    s[5] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
-
-    do {
-      s[6] = _mm_unpacklo_epi8(
-          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
-      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
-      s[7] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
-
-      res = convolve_lo_y(s + 0, coeffs);
-      res_shift = _mm_sll_epi32(res, left_shift);
-      res_shift =
-          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
-
-      __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
-      __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-      // Accumulate values into the destination buffer
-      if (do_average) {
-        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
-
-        const __m128i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-        const __m128i round_result = convolve_rounding(
-            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-        *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
-
-      } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-        _mm_storeu_si128((__m128i *)dst, res_unsigned);
-#else
-        _mm_store_si128((__m128i *)dst, res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-      }
-
-      src_ptr += src_stride;
-      dst += dst_stride;
-      dst0 += dst_stride0;
-
-      res = convolve_lo_y(s + 1, coeffs);
-      res_shift = _mm_sll_epi32(res, left_shift);
-      res_shift =
-          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
-
-      res_16b = _mm_packs_epi32(res_shift, res_shift);
-      res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-      // Accumulate values into the destination buffer
-      if (do_average) {
-        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
-
-        const __m128i comp_avg_res =
-            comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-        const __m128i round_result = convolve_rounding(
-            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-        *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
-
-      } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-        _mm_storeu_si128((__m128i *)dst, res_unsigned);
-#else
-        _mm_store_si128((__m128i *)dst, res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-      }
-
-      src_ptr += src_stride;
-      dst += dst_stride;
-      dst0 += dst_stride0;
-
-      s[0] = s[2];
-      s[1] = s[3];
-      s[2] = s[4];
-      s[3] = s[5];
-      s[4] = s[6];
-      s[5] = s[7];
-      h -= 2;
-    } while (h);
-  } else {
-    assert(!(w % 8));
-    int j = 0;
-    do {
-      __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
-      const uint8_t *data = &src_ptr[j];
-
-      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
-      s[0] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
-      s[1] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
-      s[2] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
-      s[3] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
-      s[4] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
-      s[5] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
-
-      int i = 0;
-      do {
-        data = &src_ptr[i * src_stride + j];
-        s[6] = _mm_unpacklo_epi8(
-            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
-        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
-        s[7] = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
-
-        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
-        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
-        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
-        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
-                                     round_shift);
-        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
-                                     round_shift);
-
-        __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
-        __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m128i data_ref_0 =
-              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
-
-          const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
-        } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-        i++;
-
-        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
-        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
-        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
-        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
-                                     round_shift);
-        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
-                                     round_shift);
-        res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
-        res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          __m128i data_ref_0 =
-              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
-
-          const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
-        } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-        i++;
-
-        s[0] = s[2];
-        s[1] = s[3];
-        s[2] = s[4];
-        s[3] = s[5];
-        s[4] = s[6];
-        s[5] = s[7];
-      } while (i < h);
-      j += 8;
-    } while (j < w);
-  }
-}
-
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
-                                   uint8_t *dst0, int dst_stride0, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
-                                   ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  const int bd = 8;
-
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-  const __m128i zero = _mm_setzero_si128();
-
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m128i offset_const = _mm_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
-
-  /* Horizontal filter */
-  {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_qn & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
-    for (i = 0; i < im_h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i temp_lo, temp_hi;
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
-        const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
-        temp_lo = _mm_srli_si128(src_lo, 4);
-        temp_hi = _mm_slli_si128(src_hi, 12);
-        const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        temp_lo = _mm_srli_si128(src_lo, 8);
-        temp_hi = _mm_slli_si128(src_hi, 8);
-        const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        temp_lo = _mm_srli_si128(src_lo, 12);
-        temp_hi = _mm_slli_si128(src_hi, 4);
-        const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even =
-            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
-        // Filter odd-index pixels
-        temp_lo = _mm_srli_si128(src_lo, 2);
-        temp_hi = _mm_slli_si128(src_hi, 14);
-        const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        temp_lo = _mm_srli_si128(src_lo, 6);
-        temp_hi = _mm_slli_si128(src_hi, 10);
-        const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        temp_lo = _mm_srli_si128(src_lo, 10);
-        temp_hi = _mm_slli_si128(src_hi, 6);
-        const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        temp_lo = _mm_srli_si128(src_lo, 14);
-        temp_hi = _mm_slli_si128(src_hi, 2);
-        const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-#if CONFIG_OPTFLOW_REFINEMENT
-        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
-#else
-        _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_1) >> 1) -
-        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const int16_t *data = &im_block[i * im_stride + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
-        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m128i data_ref_0 =
-              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
-
-          const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-
-          if (w > 4)
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
-          else
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_8);
-        } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-      }
-    }
-  }
-}
diff --git a/av1/common/x86/jnt_convolve_ssse3.c b/av1/common/x86/jnt_convolve_ssse3.c
deleted file mode 100644
index 4ad05b2..0000000
--- a/av1/common/x86/jnt_convolve_ssse3.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_sse2.h"
-
-void av1_dist_wtd_convolve_2d_ssse3(
-    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
-    int h, const InterpFilterParams *filter_params_x,
-    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
-    const int subpel_y_qn, ConvolveParams *conv_params) {
-  CONV_BUF_TYPE *dst = conv_params->dst;
-  int dst_stride = conv_params->dst_stride;
-  const int bd = 8;
-
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
-  int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const int do_average = conv_params->do_average;
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16(w0);
-  const __m128i wt1 = _mm_set1_epi16(w1);
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
-
-  const int offset_0 =
-      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
-  const __m128i offset_const = _mm_set1_epi16(offset);
-  const int rounding_shift =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
-
-  /* Horizontal filter */
-  {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_qn & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
-    for (i = 0; i < im_h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
-        const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
-        const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even =
-            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-#if CONFIG_OPTFLOW_REFINEMENT
-        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
-#else
-        _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        ((1 << conv_params->round_1) >> 1) -
-        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
-    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const int16_t *data = &im_block[i * im_stride + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
-                               *(__m128i *)(data + 1 * im_stride));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
-                               *(__m128i *)(data + 3 * im_stride));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
-                               *(__m128i *)(data + 5 * im_stride));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
-                               *(__m128i *)(data + 7 * im_stride));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        const __m128i res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
-        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
-
-        // Accumulate values into the destination buffer
-        if (do_average) {
-          const __m128i data_ref_0 =
-              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
-
-          const __m128i comp_avg_res =
-              comp_avg(&data_ref_0, &res_unsigned, &wt, use_wtd_comp_avg);
-
-          const __m128i round_result = convolve_rounding(
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
-
-          if (w > 4)
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
-          else
-            *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
-                _mm_cvtsi128_si32(res_8);
-        } else {
-#if CONFIG_OPTFLOW_REFINEMENT
-          _mm_storeu_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#else
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
-#endif  // CONFIG_OPTFLOW_REFINEMENT
-        }
-      }
-    }
-  }
-}
diff --git a/av1/common/x86/optflow_refine_sse4.c b/av1/common/x86/optflow_refine_sse4.c
index 514387b..6a31b8f 100644
--- a/av1/common/x86/optflow_refine_sse4.c
+++ b/av1/common/x86/optflow_refine_sse4.c
@@ -20,16 +20,6 @@
 #include "aom_dsp/x86/synonyms.h"
 
 #if CONFIG_OPTFLOW_REFINEMENT
-static INLINE __m128i round_power_of_two_signed_epi16(__m128i v_val_d,
-                                                      const __m128i v_bias_d,
-                                                      const __m128i ones,
-                                                      const int bits) {
-  const __m128i v_sign_d = _mm_sign_epi16(ones, v_val_d);
-  __m128i reg = _mm_mullo_epi16(v_val_d, v_sign_d);
-  reg = _mm_srli_epi16(_mm_adds_epi16(reg, v_bias_d), bits);
-  return _mm_mullo_epi16(reg, v_sign_d);
-}
-
 static INLINE __m128i round_power_of_two_signed_epi32(__m128i temp1,
                                                       __m128i temp2,
                                                       const __m128i v_bias_d,
@@ -47,225 +37,6 @@
   return (_mm_packs_epi32(temp1, temp2));
 }
 
-void av1_bicubic_grad_interpolation_sse4_1(const int16_t *pred_src,
-                                           int16_t *x_grad, int16_t *y_grad,
-                                           const int bw, const int bh) {
-#if OPFL_BICUBIC_GRAD
-  assert(bw % 8 == 0);
-  assert(bh % 8 == 0);
-
-  __m128i coeff_bi[4][2];
-  coeff_bi[0][0] =
-      _mm_set1_epi16((int16_t)coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][0][0]);
-  coeff_bi[0][1] =
-      _mm_set1_epi16((int16_t)coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][1][0]);
-  coeff_bi[1][0] =
-      _mm_set1_epi16((int16_t)coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][0][1]);
-  coeff_bi[1][1] =
-      _mm_set1_epi16((int16_t)coeffs_bicubic[SUBPEL_GRAD_DELTA_BITS][1][1]);
-  coeff_bi[2][0] = _mm_insert_epi16(coeff_bi[0][0], 42, 0);
-  coeff_bi[2][1] = _mm_insert_epi16(coeff_bi[0][1], -6, 0);
-  coeff_bi[3][0] = _mm_insert_epi16(coeff_bi[0][0], 42, 7);
-  coeff_bi[3][1] = _mm_insert_epi16(coeff_bi[0][1], -6, 7);
-  const __m128i v_bias_d = _mm_set1_epi16((1 << bicubic_bits) >> 1);
-  const __m128i ones = _mm_set1_epi16(1);
-
-#if OPFL_DOWNSAMP_QUINCUNX
-  __m128i mask_val[2] = { _mm_set_epi16(0, 1, 0, 1, 0, 1, 0, 1),
-                          _mm_set_epi16(1, 0, 1, 0, 1, 0, 1, 0) };
-#endif
-  if (bw < 16) {
-    __m128i coeff[2];
-    coeff[0] = _mm_insert_epi16(coeff_bi[2][0], 42, 7);
-    coeff[1] = _mm_insert_epi16(coeff_bi[2][1], -6, 7);
-
-    for (int col = 0; col < bh; col++) {
-      const int is_y_boundary = (col + 1 > bh - 1 || col - 1 < 0);
-      const int id_prev1 = AOMMAX(col - 1, 0);
-      const int id_prev2 = AOMMAX(col - 2, 0);
-      const int id_next1 = AOMMIN(col + 1, bh - 1);
-      const int id_next2 = AOMMIN(col + 2, bh - 1);
-#if OPFL_DOWNSAMP_QUINCUNX
-      __m128i mask = mask_val[col & 0x1];
-#endif
-      for (int row = 0; row < bw; row += 8) {
-        __m128i vpred_next1, vpred_prev1, vpred_next2, vpred_prev2;
-        __m128i temp, sub1, sub2;
-
-        // Subtract interpolated pixel at (i, j+delta) by the one at (i,
-        // j-delta)
-        const int16_t *src = &pred_src[col * bw + row];
-        vpred_prev1 =
-            _mm_set_epi16(*(src + 6), *(src + 5), *(src + 4), *(src + 3),
-                          *(src + 2), *(src + 1), *src, *src);
-        vpred_prev2 = _mm_set_epi16(*(src + 5), *(src + 4), *(src + 3),
-                                    *(src + 2), *(src + 1), *src, *src, *src);
-        vpred_next1 =
-            _mm_set_epi16(*(src + 7), *(src + 7), *(src + 6), *(src + 5),
-                          *(src + 4), *(src + 3), *(src + 2), *(src + 1));
-        vpred_next2 =
-            _mm_set_epi16(*(src + 7), *(src + 7), *(src + 7), *(src + 6),
-                          *(src + 5), *(src + 4), *(src + 3), *(src + 2));
-
-        sub1 = _mm_sub_epi16(vpred_next1, vpred_prev1);
-        sub2 = _mm_sub_epi16(vpred_next2, vpred_prev2);
-
-        temp = _mm_add_epi16(_mm_mullo_epi16(sub1, coeff[0]),
-                             _mm_mullo_epi16(sub2, coeff[1]));
-
-#if OPFL_DOWNSAMP_QUINCUNX
-        temp = _mm_mullo_epi16(temp, mask);
-#endif
-        temp =
-            round_power_of_two_signed_epi16(temp, v_bias_d, ones, bicubic_bits);
-
-        const int idx = col * bw + row;
-        xx_storeu_128(x_grad + idx, temp);
-
-        // Subtract interpolated pixel at (i+delta, j) by the one at (i-delta,
-        // j)
-        src = pred_src + row;
-        vpred_prev1 = xx_loadu_128(src + id_prev1 * bw);
-        vpred_prev2 = xx_loadu_128(src + id_prev2 * bw);
-        vpred_next1 = xx_loadu_128(src + id_next1 * bw);
-        vpred_next2 = xx_loadu_128(src + id_next2 * bw);
-
-        sub1 = _mm_sub_epi16(vpred_next1, vpred_prev1);
-        sub2 = _mm_sub_epi16(vpred_next2, vpred_prev2);
-
-        temp = _mm_add_epi16(_mm_mullo_epi16(sub1, coeff_bi[is_y_boundary][0]),
-                             _mm_mullo_epi16(sub2, coeff_bi[is_y_boundary][1]));
-
-#if OPFL_DOWNSAMP_QUINCUNX
-        temp = _mm_mullo_epi16(temp, mask);
-#endif
-        temp =
-            round_power_of_two_signed_epi16(temp, v_bias_d, ones, bicubic_bits);
-        xx_storeu_128(y_grad + idx, temp);
-      }
-    }
-  } else {
-    for (int col = 0; col < bh; col++) {
-      const int is_y_boundary = (col + 1 > bh - 1 || col - 1 < 0);
-      const int id_prev = AOMMAX(col - 1, 0);
-      const int id_prev2 = AOMMAX(col - 2, 0);
-      const int id_next = AOMMIN(col + 1, bh - 1);
-      const int id_next2 = AOMMIN(col + 2, bh - 1);
-#if OPFL_DOWNSAMP_QUINCUNX
-      __m128i mask = mask_val[col & 0x1];
-#endif
-      for (int row = 0; row < bw; row += 16) {
-        __m128i vpred_next1_1, vpred_prev1_1, vpred_next2_1, vpred_prev2_1;
-        __m128i vpred_next1_2, vpred_prev1_2, vpred_next2_2, vpred_prev2_2;
-        __m128i temp1, temp2;
-        __m128i sub1, sub2, sub3, sub4;
-
-        // Subtract interpolated pixel at (i, j+delta) by the one at (i,
-        // j-delta)
-        const int16_t *src = &pred_src[col * bw + row];
-        if (row - 1 < 0) {
-          vpred_prev1_1 =
-              _mm_set_epi16(*(src + 6), *(src + 5), *(src + 4), *(src + 3),
-                            *(src + 2), *(src + 1), *src, *src);
-          vpred_prev2_1 =
-              _mm_set_epi16(*(src + 5), *(src + 4), *(src + 3), *(src + 2),
-                            *(src + 1), *src, *src, *src);
-        } else {
-          vpred_prev1_1 = xx_loadu_128((__m128i *)(src - 1));
-          vpred_prev2_1 = xx_loadu_128((__m128i *)(src - 2));
-        }
-        if (row + 16 > bw - 1) {
-          vpred_next1_2 =
-              _mm_set_epi16(*(src + 15), *(src + 15), *(src + 14), *(src + 13),
-                            *(src + 12), *(src + 11), *(src + 10), *(src + 9));
-          vpred_next2_2 =
-              _mm_set_epi16(*(src + 15), *(src + 15), *(src + 15), *(src + 14),
-                            *(src + 13), *(src + 12), *(src + 11), *(src + 10));
-        } else {
-          vpred_next1_2 = xx_loadu_128(src + 9);
-          vpred_next2_2 = xx_loadu_128(src + 10);
-        }
-        vpred_prev1_2 = xx_loadu_128(src + 7);
-        vpred_prev2_2 = xx_loadu_128(src + 6);
-        vpred_next1_1 = xx_loadu_128(src + 1);
-        vpred_next2_1 = xx_loadu_128(src + 2);
-
-        sub1 = _mm_sub_epi16(vpred_next1_1, vpred_prev1_1);
-        sub2 = _mm_sub_epi16(vpred_next2_1, vpred_prev2_1);
-
-        sub3 = _mm_sub_epi16(vpred_next1_2, vpred_prev1_2);
-        sub4 = _mm_sub_epi16(vpred_next2_2, vpred_prev2_2);
-
-        const int is_left_boundary = row - 1 < 0 ? 2 : 0;
-        const int is_right_boundary = row + 16 > bw - 1 ? 3 : 0;
-        temp1 =
-            _mm_add_epi16(_mm_mullo_epi16(sub1, coeff_bi[is_left_boundary][0]),
-                          _mm_mullo_epi16(sub2, coeff_bi[is_left_boundary][1]));
-        temp2 = _mm_add_epi16(
-            _mm_mullo_epi16(sub3, coeff_bi[is_right_boundary][0]),
-            _mm_mullo_epi16(sub4, coeff_bi[is_right_boundary][1]));
-
-#if OPFL_DOWNSAMP_QUINCUNX
-        temp1 = _mm_mullo_epi16(temp1, mask);
-        temp2 = _mm_mullo_epi16(temp2, mask);
-#endif
-        temp1 = round_power_of_two_signed_epi16(temp1, v_bias_d, ones,
-                                                bicubic_bits);
-        temp2 = round_power_of_two_signed_epi16(temp2, v_bias_d, ones,
-                                                bicubic_bits);
-
-        const int idx = col * bw + row;
-        xx_storeu_128(x_grad + idx, temp1);
-        xx_storeu_128(x_grad + idx + 8, temp2);
-
-        // Subtract interpolated pixel at (i+delta, j) by the one at (i-delta,
-        // j)
-        src = pred_src + row;
-        vpred_prev1_1 = xx_loadu_128(src + bw * id_prev);
-        vpred_prev2_1 = xx_loadu_128(src + bw * id_prev2);
-        vpred_next1_1 = xx_loadu_128(src + id_next * bw);
-        vpred_next2_1 = xx_loadu_128(src + id_next2 * bw);
-
-        vpred_prev1_2 = xx_loadu_128(src + bw * id_prev + 8);
-        vpred_prev2_2 = xx_loadu_128(src + bw * id_prev2 + 8);
-        vpred_next1_2 = xx_loadu_128(src + id_next * bw + 8);
-        vpred_next2_2 = xx_loadu_128(src + id_next2 * bw + 8);
-
-        sub1 = _mm_sub_epi16(vpred_next1_1, vpred_prev1_1);
-        sub2 = _mm_sub_epi16(vpred_next2_1, vpred_prev2_1);
-
-        sub3 = _mm_sub_epi16(vpred_next1_2, vpred_prev1_2);
-        sub4 = _mm_sub_epi16(vpred_next2_2, vpred_prev2_2);
-
-        temp1 =
-            _mm_add_epi16(_mm_mullo_epi16(sub1, coeff_bi[is_y_boundary][0]),
-                          _mm_mullo_epi16(sub2, coeff_bi[is_y_boundary][1]));
-        temp2 =
-            _mm_add_epi16(_mm_mullo_epi16(sub3, coeff_bi[is_y_boundary][0]),
-                          _mm_mullo_epi16(sub4, coeff_bi[is_y_boundary][1]));
-#if OPFL_DOWNSAMP_QUINCUNX
-        temp1 = _mm_mullo_epi16(temp1, mask);
-        temp2 = _mm_mullo_epi16(temp2, mask);
-#endif
-        temp1 = round_power_of_two_signed_epi16(temp1, v_bias_d, ones,
-                                                bicubic_bits);
-        temp2 = round_power_of_two_signed_epi16(temp2, v_bias_d, ones,
-                                                bicubic_bits);
-
-        xx_storeu_128(y_grad + idx, temp1);
-        xx_storeu_128(y_grad + idx + 8, temp2);
-      }
-    }
-  }
-#else
-  (void)pred_src;
-  (void)x_grad;
-  (void)y_grad;
-  (void)bw;
-  (void)bh;
-#endif  // OPFL_BICUBIC_GRAD
-}
-
 void av1_bicubic_grad_interpolation_highbd_sse4_1(const int16_t *pred_src,
                                                   int16_t *x_grad,
                                                   int16_t *y_grad, const int bw,
@@ -807,167 +578,6 @@
                   vx1, vy1);
 }
 
-static void opfl_mv_refinement_lowbd_8x4_sse4_1(
-    const __m128i dist_d0, const __m128i dist_d0d1, const uint8_t *p0,
-    int pstride0, const uint8_t *p1, int pstride1, const int16_t *gx0,
-    const int16_t *gy0, const int16_t *gx1, const int16_t *gy1, int gstride,
-    int d0, int d1, int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0,
-    int *vx1, int *vy1) {
-  int bHeight = 4;
-  __m128i u2_0 = _mm_setzero_si128();
-  __m128i v2_0 = _mm_setzero_si128();
-  __m128i uv_0 = _mm_setzero_si128();
-  __m128i uw_0 = _mm_setzero_si128();
-  __m128i vw_0 = _mm_setzero_si128();
-  __m128i u2_1 = _mm_setzero_si128();
-  __m128i v2_1 = _mm_setzero_si128();
-  __m128i uv_1 = _mm_setzero_si128();
-  __m128i uw_1 = _mm_setzero_si128();
-  __m128i vw_1 = _mm_setzero_si128();
-
-  do {
-    __m128i gradX0 = LoadAligned16(gx0);
-    __m128i gradX1 = LoadAligned16(gx1);
-    __m128i gradY0 = LoadAligned16(gy0);
-    __m128i gradY1 = LoadAligned16(gy1);
-    __m128i pred0 = _mm_cvtepu8_epi16(LoadLo8(p0));
-    __m128i pred1 = _mm_cvtepu8_epi16(LoadLo8(p1));
-
-#if OPFL_DOWNSAMP_QUINCUNX
-    const __m128i pred0_odd = _mm_cvtepu8_epi16(LoadLo8(p0 + pstride0));
-    const __m128i pred1_odd = _mm_cvtepu8_epi16(LoadLo8(p1 + pstride1));
-
-    down_sample(&gradX0, &gradX1, &gradY0, &gradY1, &pred0, &pred1, &pred0_odd,
-                &pred1_odd, gx0, gx1, gy0, gy1, gstride);
-#endif  // OPFL_DOWNSAMP_QUINCUNX
-
-    square_accumulate_8x4(gradX0, gradX1, gradY0, gradY1, &u2_0, &v2_0, &uv_0,
-                          &uw_0, &vw_0, &u2_1, &v2_1, &uv_1, &uw_1, &vw_1,
-                          &pred0, &pred1, dist_d0, dist_d0d1);
-
-#if OPFL_DOWNSAMP_QUINCUNX
-    gx0 += gstride << 1;
-    gx1 += gstride << 1;
-    gy0 += gstride << 1;
-    gy1 += gstride << 1;
-    p0 += pstride0 << 1;
-    p1 += pstride1 << 1;
-    bHeight -= 2;
-#else
-    gx0 += gstride;
-    gx1 += gstride;
-    gy0 += gstride;
-    gy1 += gstride;
-    p0 += pstride0;
-    p1 += pstride1;
-    bHeight -= 1;
-#endif  // OPFL_DOWNSAMP_QUINCUNX
-  } while (bHeight != 0);
-
-  calculate_mv_8x4(u2_0, v2_0, uv_0, uw_0, vw_0, u2_1, v2_1, uv_1, uw_1, vw_1,
-                   d0, d1, mv_prec_bits, grad_prec_bits, vx0, vy0, vx1, vy1);
-}
-
-static void opfl_mv_refinement_lowbd_8x8_sse4_1(
-    const __m128i dist_d0, const __m128i dist_d0d1, const uint8_t *p0,
-    int pstride0, const uint8_t *p1, int pstride1, const int16_t *gx0,
-    const int16_t *gy0, const int16_t *gx1, const int16_t *gy1, int gstride,
-    int d0, int d1, int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0,
-    int *vx1, int *vy1) {
-  int bHeight = 8;
-  __m128i u2 = _mm_setzero_si128();
-  __m128i uv = _mm_setzero_si128();
-  __m128i v2 = _mm_setzero_si128();
-  __m128i uw = _mm_setzero_si128();
-  __m128i vw = _mm_setzero_si128();
-
-  do {
-    __m128i gradX0 = LoadAligned16(gx0);
-    __m128i gradX1 = LoadAligned16(gx1);
-    __m128i gradY0 = LoadAligned16(gy0);
-    __m128i gradY1 = LoadAligned16(gy1);
-    __m128i pred0 = _mm_cvtepu8_epi16(LoadLo8(p0));
-    __m128i pred1 = _mm_cvtepu8_epi16(LoadLo8(p1));
-
-#if OPFL_DOWNSAMP_QUINCUNX
-    const __m128i pred0_odd = _mm_cvtepu8_epi16(LoadLo8(p0 + pstride0));
-    const __m128i pred1_odd = _mm_cvtepu8_epi16(LoadLo8(p1 + pstride1));
-
-    down_sample(&gradX0, &gradX1, &gradY0, &gradY1, &pred0, &pred1, &pred0_odd,
-                &pred1_odd, gx0, gx1, gy0, gy1, gstride);
-#endif  // OPFL_DOWNSAMP_QUINCUNX
-
-    square_accumulate_8x8(gradX0, gradX1, gradY0, gradY1, &u2, &v2, &uv, &uw,
-                          &vw, &pred0, &pred1, dist_d0, dist_d0d1);
-
-#if OPFL_DOWNSAMP_QUINCUNX
-    gx0 += gstride << 1;
-    gx1 += gstride << 1;
-    gy0 += gstride << 1;
-    gy1 += gstride << 1;
-    p0 += pstride0 << 1;
-    p1 += pstride1 << 1;
-    bHeight -= 2;
-#else
-    gx0 += gstride;
-    gx1 += gstride;
-    gy0 += gstride;
-    gy1 += gstride;
-    p0 += pstride0;
-    p1 += pstride1;
-    bHeight -= 1;
-#endif  // OPFL_DOWNSAMP_QUINCUNX
-  } while (bHeight != 0);
-
-  calculate_mv_8x8(u2, v2, uv, uw, vw, d0, d1, mv_prec_bits, grad_prec_bits,
-                   vx0, vy0, vx1, vy1);
-}
-
-static void opfl_mv_refinement_lowbd_sse4_1(
-    const __m128i dist_d0, const __m128i dist_d0d1, const uint8_t *p0,
-    int pstride0, const uint8_t *p1, int pstride1, const int16_t *gx0,
-    const int16_t *gy0, const int16_t *gx1, const int16_t *gy1, int gstride,
-    int bw, int bh, int d0, int d1, int grad_prec_bits, int mv_prec_bits,
-    int *vx0, int *vy0, int *vx1, int *vy1) {
-  (void)bh;
-  if (bw == 4)
-    opfl_mv_refinement_lowbd_8x4_sse4_1(
-        dist_d0, dist_d0d1, p0, pstride0, p1, pstride1, gx0, gy0, gx1, gy1,
-        gstride, d0, d1, grad_prec_bits, mv_prec_bits, vx0, vy0, vx1, vy1);
-  else
-    opfl_mv_refinement_lowbd_8x8_sse4_1(
-        dist_d0, dist_d0d1, p0, pstride0, p1, pstride1, gx0, gy0, gx1, gy1,
-        gstride, d0, d1, grad_prec_bits, mv_prec_bits, vx0, vy0, vx1, vy1);
-}
-
-// Function to compute optical flow offsets in nxn blocks
-int av1_opfl_mv_refinement_nxn_lowbd_sse4_1(
-    const uint8_t *p0, int pstride0, const uint8_t *p1, int pstride1,
-    const int16_t *gx0, const int16_t *gy0, const int16_t *gx1,
-    const int16_t *gy1, int gstride, int bw, int bh, int n, int d0, int d1,
-    int grad_prec_bits, int mv_prec_bits, int *vx0, int *vy0, int *vx1,
-    int *vy1) {
-  assert(bw % n == 0 && bh % n == 0);
-  int n_blocks = 0;
-
-  __m128i dist_d0, dist_d0d1;
-  set_distance(&dist_d0, &dist_d0d1, d0, d1);
-
-  for (int i = 0; i < bh; i += n) {
-    for (int j = 0; j < bw; j += 8) {
-      opfl_mv_refinement_lowbd_sse4_1(
-          dist_d0, dist_d0d1, p0 + (i * pstride0 + j), pstride0,
-          p1 + (i * pstride1 + j), pstride1, gx0 + (i * gstride + j),
-          gy0 + (i * gstride + j), gx1 + (i * gstride + j),
-          gy1 + (i * gstride + j), gstride, n, n, d0, d1, grad_prec_bits,
-          mv_prec_bits, vx0 + n_blocks, vy0 + n_blocks, vx1 + n_blocks,
-          vy1 + n_blocks);
-      n_blocks += (n == 4) ? 2 : 1;
-    }
-  }
-  return n_blocks;
-}
-
 static void opfl_mv_refinement_highbd_8x4_sse4_1(
     const __m128i dist_d0, const __m128i dist_d0d1, const uint16_t *p0,
     int pstride0, const uint16_t *p1, int pstride1, const int16_t *gx0,
@@ -1350,135 +960,6 @@
 }
 
 #if OPFL_COMBINE_INTERP_GRAD_LS
-static AOM_FORCE_INLINE void compute_pred_using_interp_grad_sse4_1(
-    const uint8_t *src1, const uint8_t *src2, int16_t *dst1, int16_t *dst2,
-    int bw, int bh, int d0, int d1) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i mul1 = _mm_set1_epi16(d0);
-  const __m128i mul2 = _mm_sub_epi16(zero, _mm_set1_epi16(d1));
-  const __m128i mul_val1 = _mm_unpacklo_epi16(mul1, mul2);
-  const __m128i mul_val2 = _mm_unpacklo_epi16(mul1, _mm_sub_epi16(zero, mul1));
-
-  if (bw < 16) {
-    for (int i = 0; i < bh; i++) {
-      const uint8_t *inp1 = src1 + i * bw;
-      const uint8_t *inp2 = src2 + i * bw;
-      int16_t *out1 = dst1 + i * bw;
-      int16_t *out2 = dst2 + i * bw;
-      for (int j = 0; j < bw; j = j + 8) {
-        const __m128i src_buf1 = _mm_cvtepu8_epi16(xx_loadl_64(inp1 + j));
-        const __m128i src_buf2 = _mm_cvtepu8_epi16(xx_loadl_64(inp2 + j));
-
-        __m128i temp1, temp2;
-        __m128i reg1 = _mm_unpacklo_epi16(src_buf1, src_buf2);
-        __m128i reg2 = _mm_unpackhi_epi16(src_buf1, src_buf2);
-
-        temp1 = _mm_madd_epi16(reg1, mul_val1);
-        temp2 = _mm_madd_epi16(reg2, mul_val1);
-        temp1 = _mm_packs_epi32(temp1, temp2);
-
-        reg1 = _mm_madd_epi16(reg1, mul_val2);
-        reg2 = _mm_madd_epi16(reg2, mul_val2);
-        temp2 = _mm_packs_epi32(reg1, reg2);
-
-        xx_store_128(out1 + j, temp1);
-        xx_store_128(out2 + j, temp2);
-      }
-    }
-  } else {
-    for (int i = 0; i < bh; i++) {
-      const uint8_t *inp1 = src1 + i * bw;
-      const uint8_t *inp2 = src2 + i * bw;
-      int16_t *out1 = dst1 + i * bw;
-      int16_t *out2 = dst2 + i * bw;
-      for (int j = 0; j < bw; j = j + 16) {
-        const __m128i src_buf1 = xx_load_128(inp1 + j);
-        const __m128i src_buf2 = xx_load_128(inp2 + j);
-
-        __m128i temp1 = _mm_unpacklo_epi8(src_buf1, zero);
-        __m128i temp2 = _mm_unpackhi_epi8(src_buf1, zero);
-        __m128i temp3 = _mm_unpacklo_epi8(src_buf2, zero);
-        __m128i temp4 = _mm_unpackhi_epi8(src_buf2, zero);
-
-        __m128i res1, res2, res3, res4;
-        __m128i reg1, reg2, reg3, reg4;
-        reg1 = _mm_unpacklo_epi16(temp1, temp3);
-        reg2 = _mm_unpackhi_epi16(temp1, temp3);
-        reg3 = _mm_unpacklo_epi16(temp2, temp4);
-        reg4 = _mm_unpackhi_epi16(temp2, temp4);
-
-        temp1 = _mm_madd_epi16(reg1, mul_val1);
-        temp2 = _mm_madd_epi16(reg2, mul_val1);
-        res1 = _mm_packs_epi32(temp1, temp2);
-
-        temp2 = _mm_madd_epi16(reg3, mul_val1);
-        temp3 = _mm_madd_epi16(reg4, mul_val1);
-        res2 = _mm_packs_epi32(temp2, temp3);
-
-        temp3 = _mm_madd_epi16(reg1, mul_val2);
-        temp4 = _mm_madd_epi16(reg2, mul_val2);
-        res3 = _mm_packs_epi32(temp3, temp4);
-
-        temp3 = _mm_madd_epi16(reg3, mul_val2);
-        temp4 = _mm_madd_epi16(reg4, mul_val2);
-        res4 = _mm_packs_epi32(temp3, temp4);
-
-        xx_store_128(out1 + j, res1);
-        xx_store_128(out1 + j + 8, res2);
-        xx_store_128(out2 + j, res3);
-        xx_store_128(out2 + j + 8, res4);
-      }
-    }
-  }
-}
-#endif  // OPFL_COMBINE_INTERP_GRAD_LS
-
-void av1_copy_pred_array_sse4_1(const uint8_t *src1, const uint8_t *src2,
-                                int16_t *dst1, int16_t *dst2, int bw, int bh,
-                                int d0, int d1) {
-#if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-#if OPFL_COMBINE_INTERP_GRAD_LS
-  compute_pred_using_interp_grad_sse4_1(src1, src2, dst1, dst2, bw, bh, d0, d1);
-#else
-  (void)src2;
-  (void)dst2;
-  (void)d0;
-  (void)d1;
-  if (bw < 16) {
-    for (int i = 0; i < bh; i++) {
-      const uint8_t *inp1 = src1 + i * bw;
-      int16_t *out1 = dst1 + i * bw;
-      for (int j = 0; j < bw; j = j + 8) {
-        const __m128i src_buf = xx_loadl_64(inp1 + j);
-        xx_store_128(out1 + j, _mm_cvtepu8_epi16(src_buf));
-      }
-    }
-  } else {
-    const __m128i zero = _mm_setzero_si128();
-    for (int i = 0; i < bh; i++) {
-      const uint8_t *inp1 = src1 + i * bw;
-      int16_t *out1 = dst1 + i * bw;
-      for (int j = 0; j < bw; j = j + 16) {
-        const __m128i src_buf = xx_load_128(inp1 + j);
-        xx_store_128(out1 + j, _mm_unpacklo_epi8(src_buf, zero));
-        xx_store_128(out1 + j + 8, _mm_unpackhi_epi8(src_buf, zero));
-      }
-    }
-  }
-#endif  // OPFL_COMBINE_INTERP_GRAD_LS
-#else
-  (void)src1;
-  (void)dst1;
-  (void)src2;
-  (void)dst2;
-  (void)d0;
-  (void)d1;
-  (void)bw;
-  (void)bh;
-#endif  // OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-}
-
-#if OPFL_COMBINE_INTERP_GRAD_LS
 static AOM_FORCE_INLINE void compute_pred_using_interp_grad_highbd_sse4_1(
     const uint16_t *src1, const uint16_t *src2, int16_t *dst1, int16_t *dst2,
     int bw, int bh, int d0, int d1) {
diff --git a/av1/common/x86/reconinter_avx2.c b/av1/common/x86/reconinter_avx2.c
index 06641cd..9df5f09 100644
--- a/av1/common/x86/reconinter_avx2.c
+++ b/av1/common/x86/reconinter_avx2.c
@@ -20,122 +20,6 @@
 #include "aom_dsp/x86/synonyms_avx2.h"
 #include "av1/common/blockd.h"
 
-static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
-                                     const __m256i s1) {
-  const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
-  return _mm256_abs_epi16(
-      _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
-  // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
-}
-void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
-                                          DIFFWTD_MASK_TYPE mask_type,
-                                          const uint8_t *src0, int src0_stride,
-                                          const uint8_t *src1, int src1_stride,
-                                          int h, int w) {
-  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
-  const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
-  int i = 0;
-  if (4 == w) {
-    do {
-      const __m128i s0A = xx_loadl_32(src0);
-      const __m128i s0B = xx_loadl_32(src0 + src0_stride);
-      const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
-      const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
-      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
-      const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
-      const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
-      const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
-
-      const __m128i s1A = xx_loadl_32(src1);
-      const __m128i s1B = xx_loadl_32(src1 + src1_stride);
-      const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
-      const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
-      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
-      const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
-      const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
-      const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
-      const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
-      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
-      const __m128i x_m8 =
-          _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
-      xx_storeu_128(mask, x_m8);
-      src0 += (src0_stride << 2);
-      src1 += (src1_stride << 2);
-      mask += 16;
-      i += 4;
-    } while (i < h);
-  } else if (8 == w) {
-    do {
-      const __m128i s0A = xx_loadl_64(src0);
-      const __m128i s0B = xx_loadl_64(src0 + src0_stride);
-      const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
-      const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
-      const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
-      const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
-      const __m128i s1A = xx_loadl_64(src1);
-      const __m128i s1B = xx_loadl_64(src1 + src1_stride);
-      const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
-      const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
-      const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
-      const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
-      const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
-      const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
-      const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
-      yy_storeu_256(mask, m8);
-      src0 += src0_stride << 2;
-      src1 += src1_stride << 2;
-      mask += 32;
-      i += 4;
-    } while (i < h);
-  } else if (16 == w) {
-    do {
-      const __m128i s0A = xx_load_128(src0);
-      const __m128i s0B = xx_load_128(src0 + src0_stride);
-      const __m128i s1A = xx_load_128(src1);
-      const __m128i s1B = xx_load_128(src1 + src1_stride);
-      const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
-      const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
-      const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
-      const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
-
-      const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
-      const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
-
-      const __m256i m8 =
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
-      yy_storeu_256(mask, m8);
-      src0 += src0_stride << 1;
-      src1 += src1_stride << 1;
-      mask += 32;
-      i += 2;
-    } while (i < h);
-  } else {
-    do {
-      int j = 0;
-      do {
-        const __m256i s0 = yy_loadu_256(src0 + j);
-        const __m256i s1 = yy_loadu_256(src1 + j);
-        const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
-        const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
-        const __m256i s0H =
-            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
-        const __m256i s1H =
-            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
-        const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
-        const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
-        const __m256i m8 =
-            _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
-        yy_storeu_256(mask + j, m8);
-        j += 32;
-      } while (j < w);
-      src0 += src0_stride;
-      src1 += src1_stride;
-      mask += w;
-      i += 1;
-    } while (i < h);
-  }
-}
-
 static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
                                          const __m256i *data_src1,
                                          const __m256i *round_const,
diff --git a/av1/common/x86/reconinter_sse4.c b/av1/common/x86/reconinter_sse4.c
index b883e3c..fa23775 100644
--- a/av1/common/x86/reconinter_sse4.c
+++ b/av1/common/x86/reconinter_sse4.c
@@ -17,84 +17,6 @@
 #include "aom_dsp/blend.h"
 #include "av1/common/blockd.h"
 
-static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
-                                const __m128i s1) {
-  const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
-  return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));
-  // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
-}
-
-void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
-                                            DIFFWTD_MASK_TYPE mask_type,
-                                            const uint8_t *src0, int stride0,
-                                            const uint8_t *src1, int stride1,
-                                            int h, int w) {
-  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
-  const __m128i mask_base = _mm_set1_epi16(38 - mb);
-  int i = 0;
-  if (4 == w) {
-    do {
-      const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
-      const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
-      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
-      const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
-
-      const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
-      const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
-      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
-      const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
-
-      const __m128i m16 = calc_mask(mask_base, s0, s1);
-      const __m128i m8 = _mm_packus_epi16(m16, m16);
-
-      *(uint32_t *)mask = _mm_cvtsi128_si32(m8);
-      *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += 8;
-      i += 2;
-    } while (i < h);
-  } else if (8 == w) {
-    do {
-      __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
-      __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
-      s0 = _mm_cvtepu8_epi16(s0);
-      s1 = _mm_cvtepu8_epi16(s1);
-      const __m128i m16 = calc_mask(mask_base, s0, s1);
-      const __m128i m8 = _mm_packus_epi16(m16, m16);
-      _mm_storel_epi64((__m128i *)mask, m8);
-      src0 += stride0;
-      src1 += stride1;
-      mask += 8;
-      i += 1;
-    } while (i < h);
-  } else {
-    const __m128i zero = _mm_setzero_si128();
-    do {
-      int j = 0;
-      do {
-        const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));
-        const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
-        const __m128i s0L = _mm_cvtepu8_epi16(s0);
-        const __m128i s1L = _mm_cvtepu8_epi16(s1);
-        const __m128i s0H = _mm_unpackhi_epi8(s0, zero);
-        const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
-
-        const __m128i m16L = calc_mask(mask_base, s0L, s1L);
-        const __m128i m16H = calc_mask(mask_base, s0H, s1H);
-
-        const __m128i m8 = _mm_packus_epi16(m16L, m16H);
-        _mm_store_si128((__m128i *)(mask + j), m8);
-        j += 16;
-      } while (j < w);
-      src0 += stride0;
-      src1 += stride1;
-      mask += w;
-      i += 1;
-    } while (i < h);
-  }
-}
-
 void av1_build_compound_diffwtd_mask_d16_sse4_1(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
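
For reference, a minimal scalar sketch of what the two removed 8-bit kernels (av1_build_compound_diffwtd_mask_avx2 and _sse4_1) vectorized. The constant 38, the >>4 diff scaling and the 64 alpha bound are taken from the removed code above; the function name is hypothetical and the clamp is written out inline rather than via libaom helpers:

#include <stdint.h>
#include <stdlib.h> /* abs() */

/* Illustrative scalar equivalent of the removed 8-bit diffwtd mask builders.
 * mask stride is w, matching the removed SIMD kernels. */
static void diffwtd_mask_8bit_sketch(uint8_t *mask, int is_inverse,
                                     const uint8_t *src0, int stride0,
                                     const uint8_t *src1, int stride1,
                                     int h, int w) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      /* |s0 - s1| / 16, offset by 38, clamp to [0, 64]. The SIMD versions
       * skip the clamp because 38 + diff/16 never exceeds 53 for 8-bit input. */
      const int diff = abs(src0[i * stride0 + j] - src1[i * stride1 + j]) >> 4;
      int m = 38 + diff;
      if (m > 64) m = 64;
      mask[i * w + j] = (uint8_t)(is_inverse ? 64 - m : m);
    }
  }
}

The DIFFWTD_38_INV type simply mirrors the result around 64, which is why the removed SIMD code folds the inversion into mask_base and takes an absolute value instead of branching per pixel.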
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
deleted file mode 100644
index 337c480..0000000
--- a/av1/common/x86/resize_ssse3.c
+++ /dev/null
@@ -1,965 +0,0 @@
-/*
- *
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <tmmintrin.h>  // SSSE3
-
-#include "aom_dsp/x86/mem_sse2.h"
-#include "aom_dsp/x86/transpose_sse2.h"
-#include "av1/common/resize.h"
-#include "config/av1_rtcd.h"
-#include "config/aom_scale_rtcd.h"
-
-static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
-    const uint8_t *const src, const __m128i *const mask) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
-  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
-  const __m128i a_and = _mm_and_si128(a, *mask);
-  const __m128i b_and = _mm_and_si128(b, *mask);
-  return _mm_packus_epi16(a_and, b_and);
-}
-
-static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
-                                        __m128i *const f) {
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-}
-
-static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
-                                            __m128i *const f) {
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
-  // half of f[0] and f[4].
-  assert(filter[3] >= 0 && filter[3] < 256);
-  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
-  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
-  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
-  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
-  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
-}
-
-static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
-                                                    const __m128i *const f) {
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i k_64 = _mm_set1_epi16(1 << 6);
-  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
-  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
-  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
-  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
-  // compensate the subtracted 64 in f[1]. x4 is always non negative.
-  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
-  // add and saturate the results together
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, x1);
-  temp = _mm_adds_epi16(temp, x2);
-  temp = _mm_adds_epi16(temp, x4);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_adds_epi16(temp, k_64);
-  temp = _mm_srai_epi16(temp, 7);
-  return temp;
-}
-
-static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
-                                                   const __m128i *const f) {
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i k_64 = _mm_set1_epi16(1 << 6);
-  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
-  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
-  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
-  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
-  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
-  // compensate the subtracted 64 in f[2]. x5 is always non negative.
-  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
-  __m128i temp;
-
-  // add and saturate the results together
-  temp = _mm_adds_epi16(x0, x1);
-  temp = _mm_adds_epi16(temp, x2);
-  temp = _mm_adds_epi16(temp, x3);
-  temp = _mm_adds_epi16(temp, x4);
-  temp = _mm_adds_epi16(temp, x5);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_adds_epi16(temp, k_64);
-  temp = _mm_srai_epi16(temp, 7);
-  return temp;
-}
-
-static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
-                                        const __m128i *const f) {
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i k_64 = _mm_set1_epi16(1 << 6);
-  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
-  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
-  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
-  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
-  __m128i sum1, sum2;
-
-  // sum the results together, saturating only on the final step
-  // adding x0 with x2 and x1 with x3 is the only order that prevents
-  // outranges for all filters
-  sum1 = _mm_add_epi16(x0, x2);
-  sum2 = _mm_add_epi16(x1, x3);
-  // add the rounding offset early to avoid another saturated add
-  sum1 = _mm_add_epi16(sum1, k_64);
-  sum1 = _mm_adds_epi16(sum1, sum2);
-  // shift by 7 bit each 16 bit
-  sum1 = _mm_srai_epi16(sum1, 7);
-  return sum1;
-}
-
-static void scale_plane_2_to_1_phase_0(const uint8_t *src,
-                                       const ptrdiff_t src_stride, uint8_t *dst,
-                                       const ptrdiff_t dst_stride,
-                                       const int dst_w, const int dst_h) {
-  const int max_width = (dst_w + 15) & ~15;
-  const __m128i mask = _mm_set1_epi16(0x00FF);
-  int y = dst_h;
-
-  do {
-    int x = max_width;
-    do {
-      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
-      _mm_storeu_si128((__m128i *)dst, d);
-      src += 32;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src += 2 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static void scale_plane_4_to_1_phase_0(const uint8_t *src,
-                                       const ptrdiff_t src_stride, uint8_t *dst,
-                                       const ptrdiff_t dst_stride,
-                                       const int dst_w, const int dst_h) {
-  const int max_width = (dst_w + 15) & ~15;
-  const __m128i mask = _mm_set1_epi32(0x000000FF);
-  int y = dst_h;
-
-  do {
-    int x = max_width;
-    do {
-      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
-      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
-      const __m128i d2 = _mm_packus_epi16(d0, d1);
-      _mm_storeu_si128((__m128i *)dst, d2);
-      src += 64;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src += 4 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
-                                                  const __m128i c0c1) {
-  const __m128i k_64 = _mm_set1_epi16(1 << 6);
-  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
-  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
-  // round and shift by 7 bit each 16 bit
-  const __m128i t2 = _mm_adds_epi16(t0, k_64);
-  const __m128i t3 = _mm_adds_epi16(t1, k_64);
-  const __m128i t4 = _mm_srai_epi16(t2, 7);
-  const __m128i t5 = _mm_srai_epi16(t3, 7);
-  return _mm_packus_epi16(t4, t5);
-}
-
-static void scale_plane_2_to_1_bilinear(const uint8_t *src,
-                                        const ptrdiff_t src_stride,
-                                        uint8_t *dst,
-                                        const ptrdiff_t dst_stride,
-                                        const int dst_w, const int dst_h,
-                                        const __m128i c0c1) {
-  const int max_width = (dst_w + 15) & ~15;
-  int y = dst_h;
-
-  do {
-    int x = max_width;
-    do {
-      __m128i s[2], d[2];
-
-      // Horizontal
-      // Even rows
-      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
-      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
-      d[0] = scale_plane_bilinear_kernel(s, c0c1);
-
-      // odd rows
-      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
-      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
-      d[1] = scale_plane_bilinear_kernel(s, c0c1);
-
-      // Vertical
-      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
-      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
-      d[0] = scale_plane_bilinear_kernel(s, c0c1);
-
-      _mm_storeu_si128((__m128i *)dst, d[0]);
-      src += 32;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src += 2 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static void scale_plane_4_to_1_bilinear(const uint8_t *src,
-                                        const ptrdiff_t src_stride,
-                                        uint8_t *dst,
-                                        const ptrdiff_t dst_stride,
-                                        const int dst_w, const int dst_h,
-                                        const __m128i c0c1) {
-  const int max_width = (dst_w + 15) & ~15;
-  int y = dst_h;
-
-  do {
-    int x = max_width;
-    do {
-      __m128i s[8], d[8];
-
-      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
-      //       Here we tried to not use shuffle instructions which would be slow
-      //       on some x86 CPUs.
-
-      // Horizontal
-      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
-      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
-      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
-      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
-      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
-      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
-      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
-      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
-      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
-      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
-      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
-      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
-      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
-      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
-      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
-      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
-
-      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
-      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
-      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
-      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
-      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
-      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
-      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
-      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
-      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
-      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
-      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
-      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
-      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
-      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
-      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
-      d[7] = _mm_unpackhi_epi16(s[3], s[7]);
-
-      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
-      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
-      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
-      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
-      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
-      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
-      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
-      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
-      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
-      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
-      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
-      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
-      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
-      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
-      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
-      s[7] = _mm_unpackhi_epi32(d[6], d[7]);
-
-      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
-      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
-      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
-      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
-      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
-      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
-      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
-      d[3] = _mm_unpacklo_epi32(s[6], s[7]);
-
-      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
-      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
-
-      // Vertical
-      d[0] = scale_plane_bilinear_kernel(d, c0c1);
-
-      _mm_storeu_si128((__m128i *)dst, d[0]);
-      src += 64;
-      dst += 16;
-      x -= 16;
-    } while (x);
-    src += 4 * (src_stride - max_width);
-    dst += dst_stride - max_width;
-  } while (--y);
-}
-
-static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
-                                       uint8_t *dst, const int dst_stride,
-                                       const int w, const int h,
-                                       const int16_t *const coef,
-                                       uint8_t *const temp_buffer) {
-  const int width_hor = (w + 1) & ~1;
-  const int width_ver = (w + 7) & ~7;
-  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
-  const int height_ver = (h + 1) & ~1;
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  __m128i s[11], d[4];
-  __m128i f[4];
-
-  assert(w && h);
-
-  shuffle_filter_ssse3(coef, f);
-  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
-
-  // horizontal 2x8
-  do {
-    load_8bit_8x8(src + 4, src_stride, s);
-    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
-    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
-    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
-    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
-    transpose_16bit_4x8(s, s);
-    x = width_hor;
-
-    do {
-      src += 8;
-      load_8bit_8x8(src, src_stride, &s[2]);
-      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
-      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
-      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
-      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
-      transpose_16bit_4x8(&s[2], &s[2]);
-
-      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
-      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71
-
-      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
-      d[0] = _mm_packus_epi16(d[0], d[0]);
-      d[1] = _mm_packus_epi16(d[1], d[1]);
-      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
-      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
-      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
-
-      s[0] = s[4];
-      s[1] = s[5];
-
-      t += 4;
-      x -= 2;
-    } while (x);
-    src += 8 * src_stride - 4 * width_hor;
-    t += 6 * width_hor;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x2
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
-    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
-    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
-    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
-    t += 4 * width_hor;
-    y = height_ver;
-
-    do {
-      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
-      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
-      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
-      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
-      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
-      t += 8 * width_hor;
-
-      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
-      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17
-
-      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
-      d[0] = _mm_packus_epi16(d[0], d[1]);
-      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
-      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
-
-      s[0] = s[4];
-      s[1] = s[5];
-
-      dst += 2 * dst_stride;
-      y -= 2;
-    } while (y);
-    t -= width_hor * (4 * height_ver + 4);
-    t += 16;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
-                                       uint8_t *dst, const int dst_stride,
-                                       const int w, const int h,
-                                       const int16_t *const coef,
-                                       uint8_t *const temp_buffer) {
-  const int width_hor = (w + 3) & ~3;
-  const int width_ver = (w + 7) & ~7;
-  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
-  const int height_ver = (h + 3) & ~3;
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  __m128i s[11], d[4];
-  __m128i f[4];
-
-  assert(w && h);
-
-  shuffle_filter_ssse3(coef, f);
-  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
-
-  // horizontal 4x8
-  do {
-    load_8bit_8x8(src + 2, src_stride, s);
-    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
-    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
-    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
-    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
-    transpose_16bit_4x8(s, s);
-    x = width_hor;
-
-    do {
-      src += 8;
-      load_8bit_8x8(src, src_stride, &s[3]);
-      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
-      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
-      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
-      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
-      transpose_16bit_4x8(&s[3], &s[3]);
-
-      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
-      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
-      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
-      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73
-
-      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
-      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
-      d[0] = _mm_packus_epi16(d[0], d[2]);
-      d[1] = _mm_packus_epi16(d[1], d[3]);
-      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
-      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
-      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
-      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
-      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
-      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
-      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
-      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
-      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
-
-      s[0] = s[4];
-      s[1] = s[5];
-      s[2] = s[6];
-
-      t += 8;
-      x -= 4;
-    } while (x);
-    src += 8 * src_stride - 2 * width_hor;
-    t += 6 * width_hor;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x4
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
-    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
-    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
-    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
-    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
-    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
-    t += 6 * width_hor;
-    y = height_ver;
-
-    do {
-      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
-      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
-      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
-      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 77
-      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
-      t += 8 * width_hor;
-
-      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
-      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
-      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
-      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37
-
-      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
-      d[0] = _mm_packus_epi16(d[0], d[1]);
-      d[1] = _mm_packus_epi16(d[2], d[3]);
-      store_8bit_8x4_from_16x2(d, dst, dst_stride);
-
-      s[0] = s[4];
-      s[1] = s[5];
-      s[2] = s[6];
-
-      dst += 4 * dst_stride;
-      y -= 4;
-    } while (y);
-    t -= width_hor * (2 * height_ver + 6);
-    t += 16;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
-                                     __m128i *const f);
-
-typedef __m128i (*convolve8_funcs)(const __m128i *const s,
-                                   const __m128i *const f);
-
-static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
-                                       uint8_t *dst, const int dst_stride,
-                                       const int w, const int h,
-                                       const InterpKernel *const coef,
-                                       const int phase,
-                                       uint8_t *const temp_buffer) {
-  static const int step_q4 = 16 * 4 / 3;
-  const int width_hor = (w + 5) - ((w + 5) % 6);
-  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
-  const int width_ver = (w + 7) & ~7;
-  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
-  // above and (SUBPEL_TAPS / 2) extra rows below.
-  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
-  const int height_ver = (h + 5) - ((h + 5) % 6);
-  int x, y = height_hor;
-  uint8_t *t = temp_buffer;
-  __m128i s[12], d[6], dd[4];
-  __m128i f0[4], f1[5], f2[5];
-  // The offset of the first row is always less than 1 pixel.
-  const int offset1_q4 = phase + 1 * step_q4;
-  const int offset2_q4 = phase + 2 * step_q4;
-  // offset_idxx indicates the pixel offset is even (0) or odd (1).
-  // It's used to choose the src offset and filter coefficient offset.
-  const int offset_idx1 = (offset1_q4 >> 4) & 1;
-  const int offset_idx2 = (offset2_q4 >> 4) & 1;
-  static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
-    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
-  };
-  static const convolve8_funcs convolve8_func_list[2] = {
-    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
-  };
-
-  assert(w && h);
-
-  shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
-  shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
-  shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
-
-  // Sub 64 to avoid overflow.
-  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
-  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
-  // When filter phase idx is 1, the two biggest coefficients are shuffled
-  // together, and the sum of them are always no less than 128. Sub 64 here.
-  // After the subtraction, when the sum of all positive coefficients are no
-  // larger than 128, and the sum of all negative coefficients are no
-  // less than -128, there will be no overflow in the convolve8 functions.
-  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
-  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
-  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
-
-  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
-
-  // horizontal 6x8
-  do {
-    load_8bit_8x8(src, src_stride, s);
-    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
-    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
-    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
-    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
-    transpose_16bit_4x8(s, s);
-    x = width_hor;
-
-    do {
-      src += 8;
-      load_8bit_8x8(src, src_stride, &s[4]);
-      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
-      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
-      // OC 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
-      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
-      transpose_16bit_4x8(&s[4], &s[4]);
-
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
-      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
-      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
-      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
-      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
-      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
-
-      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
-      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
-      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
-      dd[0] = _mm_packus_epi16(d[0], d[2]);
-      dd[1] = _mm_packus_epi16(d[1], d[3]);
-      dd[2] = _mm_packus_epi16(d[4], d[4]);
-      dd[3] = _mm_packus_epi16(d[5], d[5]);
-
-      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
-      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
-      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
-      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
-      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
-      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
-
-      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
-      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
-      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
-      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
-      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
-      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
-      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
-      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
-
-      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
-      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
-      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
-      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
-      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
-      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
-      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
-      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
-
-      // store 4 extra pixels
-      storeu_8bit_16x4(d, t, stride_hor);
-
-      s[0] = s[4];
-      s[1] = s[5];
-      s[2] = s[6];
-      s[3] = s[7];
-
-      t += 12;
-      x -= 6;
-    } while (x);
-    src += 8 * src_stride - 4 * width_hor / 3;
-    t += 3 * stride_hor + 4;
-    y -= 8;
-  } while (y);
-
-  // vertical 8x6
-  x = width_ver;
-  t = temp_buffer;
-  do {
-    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
-    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
-    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
-    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
-    loadu_8bit_16x4(t, stride_hor, s);
-    y = height_ver;
-
-    do {
-      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
-      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
-      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
-      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
-      t += 4 * stride_hor;
-      loadu_8bit_16x4(t, stride_hor, &s[4]);
-
-      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
-      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
-      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
-      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
-      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
-      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
-
-      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
-      d[0] = _mm_packus_epi16(d[0], d[1]);
-      d[2] = _mm_packus_epi16(d[2], d[3]);
-      d[4] = _mm_packus_epi16(d[4], d[5]);
-
-      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
-      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
-      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
-      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
-      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
-      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
-
-      s[0] = s[4];
-      s[1] = s[5];
-      s[2] = s[6];
-      s[3] = s[7];
-
-      dst += 6 * dst_stride;
-      y -= 6;
-    } while (y);
-    t -= stride_hor * 2 * height_ver / 3;
-    t += 16;
-    dst -= height_ver * dst_stride;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
-                                                  const __m128i *const f) {
-  __m128i ss[4], temp;
-
-  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
-  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
-  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
-  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
-  temp = convolve8_8_ssse3(ss, f);
-  return _mm_packus_epi16(temp, temp);
-}
-
-// Only calculate odd columns since even columns are just src pixels' copies.
-static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
-                                     const int w, const __m128i *const f) {
-  int x = w;
-
-  do {
-    __m128i s[8], temp;
-    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
-    s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
-    s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
-    s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
-    s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
-    s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
-    s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
-    s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
-    temp = scale_1_to_2_phase_0_kernel(s, f);
-    _mm_storel_epi64((__m128i *)dst, temp);
-    src += 8;
-    dst += 8;
-    x -= 8;
-  } while (x);
-}
-
-static void scale_plane_1_to_2_phase_0(const uint8_t *src,
-                                       const ptrdiff_t src_stride, uint8_t *dst,
-                                       const ptrdiff_t dst_stride,
-                                       const int src_w, const int src_h,
-                                       const int16_t *const coef,
-                                       uint8_t *const temp_buffer) {
-  int max_width;
-  int y;
-  uint8_t *tmp[9];
-  __m128i f[4];
-
-  max_width = (src_w + 7) & ~7;
-  tmp[0] = temp_buffer + 0 * max_width;
-  tmp[1] = temp_buffer + 1 * max_width;
-  tmp[2] = temp_buffer + 2 * max_width;
-  tmp[3] = temp_buffer + 3 * max_width;
-  tmp[4] = temp_buffer + 4 * max_width;
-  tmp[5] = temp_buffer + 5 * max_width;
-  tmp[6] = temp_buffer + 6 * max_width;
-  tmp[7] = temp_buffer + 7 * max_width;
-
-  shuffle_filter_ssse3(coef, f);
-
-  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
-  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
-  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
-  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
-  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
-  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
-  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
-
-  y = src_h;
-  do {
-    int x;
-    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
-    for (x = 0; x < max_width; x += 8) {
-      __m128i s[8], C, D, CD;
-
-      // Even rows
-      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
-      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
-      const __m128i ab = _mm_unpacklo_epi8(a, b);
-      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
-
-      // Odd rows
-      // Even columns
-      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
-      C = scale_1_to_2_phase_0_kernel(s, f);
-
-      // Odd columns
-      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
-      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
-      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
-      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
-      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
-      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
-      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
-      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
-      D = scale_1_to_2_phase_0_kernel(s, f);
-
-      CD = _mm_unpacklo_epi8(C, D);
-      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
-    }
-
-    src += src_stride;
-    dst += 2 * dst_stride;
-    tmp[8] = tmp[0];
-    tmp[0] = tmp[1];
-    tmp[1] = tmp[2];
-    tmp[2] = tmp[3];
-    tmp[3] = tmp[4];
-    tmp[4] = tmp[5];
-    tmp[5] = tmp[6];
-    tmp[6] = tmp[7];
-    tmp[7] = tmp[8];
-  } while (--y);
-}
-
-void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
-                                       YV12_BUFFER_CONFIG *dst,
-                                       const InterpFilter filter,
-                                       const int phase, const int num_planes) {
-  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
-  // the static analysis warnings.
-  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
-    const int is_uv = i > 0;
-    const int src_w = src->crop_widths[is_uv];
-    const int src_h = src->crop_heights[is_uv];
-    const int src_y_w = (src->crop_widths[0] + 1) & ~1;
-    const int dst_w = dst->crop_widths[is_uv];
-    const int dst_h = dst->crop_heights[is_uv];
-    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
-    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
-
-    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
-      // 2 to 1
-      if (phase == 0) {
-        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], dst_w,
-                                   dst_h);
-      } else if (filter == BILINEAR) {
-        const int16_t c0 = av1_bilinear_filters[phase][3];
-        const int16_t c1 = av1_bilinear_filters[phase][4];
-        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
-        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
-                                    dst->buffers[i], dst->strides[is_uv], dst_w,
-                                    dst_h, c0c1);
-      } else {
-        const int buffer_stride = (dst_y_w + 3) & ~3;
-        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
-        uint8_t *const temp_buffer =
-            (uint8_t *)malloc(buffer_stride * buffer_height);
-        if (temp_buffer) {
-          const InterpKernel *interp_kernel =
-              (const InterpKernel *)av1_interp_filter_params_list[filter]
-                  .filter_ptr;
-          scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
-                                     dst->buffers[i], dst->strides[is_uv],
-                                     dst_w, dst_h, interp_kernel[phase],
-                                     temp_buffer);
-          free(temp_buffer);
-        }
-      }
-    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
-      // 4 to 1
-      if (phase == 0) {
-        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], dst_w,
-                                   dst_h);
-      } else if (filter == BILINEAR) {
-        const int16_t c0 = av1_bilinear_filters[phase][3];
-        const int16_t c1 = av1_bilinear_filters[phase][4];
-        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
-        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
-                                    dst->buffers[i], dst->strides[is_uv], dst_w,
-                                    dst_h, c0c1);
-      } else {
-        const int buffer_stride = (dst_y_w + 1) & ~1;
-        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
-        // When dst_w is 1 or 2, we need extra padding to avoid heap read
-        // overflow
-        const int extra_padding = 16;
-        uint8_t *const temp_buffer =
-            (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
-        if (temp_buffer) {
-          const InterpKernel *interp_kernel =
-              (const InterpKernel *)av1_interp_filter_params_list[filter]
-                  .filter_ptr;
-          scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
-                                     dst->buffers[i], dst->strides[is_uv],
-                                     dst_w, dst_h, interp_kernel[phase],
-                                     temp_buffer);
-          free(temp_buffer);
-        }
-      }
-    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
-      // 4 to 3
-      const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
-      const int buffer_stride_ver = (dst_y_w + 7) & ~7;
-      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
-      // When the vertical filter reads more pixels than the horizontal filter
-      // generated in each row, we need extra padding to avoid heap read
-      // overflow. For example, the horizontal filter generates 18 pixels but
-      // the vertical filter reads 24 pixels in a row. The difference is
-      // multiplied by 2 since two rows are interlaced together in the
-      // optimization.
-      const int extra_padding =
-          (buffer_stride_ver > buffer_stride_hor)
-              ? 2 * (buffer_stride_ver - buffer_stride_hor)
-              : 0;
-      const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
-      uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
-      if (temp_buffer) {
-        const InterpKernel *interp_kernel =
-            (const InterpKernel *)av1_interp_filter_params_list[filter]
-                .filter_ptr;
-        scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], dst_w,
-                                   dst_h, interp_kernel, phase, temp_buffer);
-        free(temp_buffer);
-      }
-    } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
-      // 1 to 2
-      uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
-      if (temp_buffer) {
-        const InterpKernel *interp_kernel =
-            (const InterpKernel *)av1_interp_filter_params_list[filter]
-                .filter_ptr;
-        scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
-                                   dst->buffers[i], dst->strides[is_uv], src_w,
-                                   src_h, interp_kernel[8], temp_buffer);
-        free(temp_buffer);
-      }
-    } else {
-      av1_resize_plane(src->buffers[i], src_h, src_w, src->strides[is_uv],
-                       dst->buffers[i], dst_h, dst_w, dst->strides[is_uv]);
-    }
-  }
-  aom_extend_frame_borders(dst, num_planes);
-}
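
The whole of resize_ssse3.c goes away with the low-bitdepth path; it held the 8-bit SIMD plane scalers (2:1, 4:1, 4:3 and 1:2) dispatched by av1_resize_and_extend_frame_ssse3. As a reminder of what the simplest of these computed, an illustrative scalar equivalent of the phase-0 2:1 scaler (function name hypothetical); the filtered variants additionally ran the 8-tap convolve kernels deleted above:

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar equivalent of the deleted scale_plane_2_to_1_phase_0():
 * with phase 0 there is no filtering, the output keeps every second sample
 * of every second row. */
static void scale_plane_2_to_1_phase_0_sketch(const uint8_t *src,
                                              ptrdiff_t src_stride,
                                              uint8_t *dst,
                                              ptrdiff_t dst_stride,
                                              int dst_w, int dst_h) {
  for (int y = 0; y < dst_h; ++y) {
    for (int x = 0; x < dst_w; ++x) {
      dst[y * dst_stride + x] = src[2 * y * src_stride + 2 * x];
    }
  }
}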
diff --git a/av1/common/x86/selfguided_avx2.c b/av1/common/x86/selfguided_avx2.c
index 5636cf0..3cb63fb 100644
--- a/av1/common/x86/selfguided_avx2.c
+++ b/av1/common/x86/selfguided_avx2.c
@@ -19,12 +19,6 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
 
-// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
-// 32-bit precision and return them in an AVX2 register.
-static __m256i yy256_load_extend_8_32(const void *p) {
-  return _mm256_cvtepu8_epi32(xx_loadl_64(p));
-}
-
 // Load 8 halfwords from the possibly-misaligned pointer p, extend each
 // halfword to 32-bit precision and return them in an AVX2 register.
 static __m256i yy256_load_extend_16_32(const void *p) {
@@ -84,50 +78,6 @@
   return dest;
 }
 
-static void integral_images(const uint8_t *src, int src_stride, int width,
-                            int height, int32_t *A, int32_t *B,
-                            int buf_stride) {
-  const __m256i zero = _mm256_setzero_si256();
-  // Write out the zero top row
-  memset_zero_avx(A, &zero, (width + 8));
-  memset_zero_avx(B, &zero, (width + 8));
-  for (int i = 0; i < height; ++i) {
-    // Zero the left column.
-    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
-
-    // ldiff is the difference H - D where H is the output sample immediately
-    // to the left and D is the output sample above it. These are scalars,
-    // replicated across the eight lanes.
-    __m256i ldiff1 = zero, ldiff2 = zero;
-    for (int j = 0; j < width; j += 8) {
-      const int ABj = 1 + j;
-
-      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
-      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
-
-      const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
-      const __m256i x2 = _mm256_madd_epi16(x1, x1);
-
-      const __m256i sc1 = scan_32(x1);
-      const __m256i sc2 = scan_32(x2);
-
-      const __m256i row1 =
-          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
-      const __m256i row2 =
-          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
-
-      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
-      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
-
-      // Calculate the new H - D.
-      ldiff1 = _mm256_set1_epi32(
-          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
-      ldiff2 = _mm256_set1_epi32(
-          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
-    }
-  }
-}
-
 // Compute two integral images from src. B sums elements; A sums their squares
 //
 // A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
@@ -324,22 +274,19 @@
 // across A, B with "cross sums" (see cross_sum implementation above).
 static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                          const int32_t *B, int buf_stride, const void *dgd8,
-                         int dgd_stride, int width, int height, int highbd) {
+                         int dgd_stride, int width, int height) {
   const int nb = 5;
   const __m256i rounding =
       round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  const uint8_t *dgd_real =
-      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+  const uint8_t *dgd_real = (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8);
 
   for (int i = 0; i < height; ++i) {
     for (int j = 0; j < width; j += 8) {
       const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
       const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
 
-      const __m128i raw =
-          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
-      const __m256i src =
-          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+      const __m128i raw = xx_loadu_128(dgd_real + ((i * dgd_stride + j) << 1));
+      const __m256i src = _mm256_cvtepu16_epi32(raw);
 
       __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
       __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
@@ -494,7 +441,7 @@
 static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                               const int32_t *B, int buf_stride,
                               const void *dgd8, int dgd_stride, int width,
-                              int height, int highbd) {
+                              int height) {
   const int nb0 = 5;
   const int nb1 = 4;
 
@@ -503,8 +450,7 @@
   const __m256i rounding1 =
       round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
 
-  const uint8_t *dgd_real =
-      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+  const uint8_t *dgd_real = (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8);
 
   for (int i = 0; i < height; ++i) {
     if (!(i & 1)) {  // even row
@@ -515,9 +461,8 @@
             cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
 
         const __m128i raw =
-            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m256i src =
-            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << 1));
+        const __m256i src = _mm256_cvtepu16_epi32(raw);
 
         __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
         __m256i w =
@@ -532,9 +477,8 @@
         const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
 
         const __m128i raw =
-            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m256i src =
-            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << 1));
+        const __m256i src = _mm256_cvtepu16_epi32(raw);
 
         __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
         __m256i w =
@@ -550,8 +494,7 @@
 int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
                                     int dgd_stride, int32_t *flt0,
                                     int32_t *flt1, int flt_stride,
-                                    int sgr_params_idx, int bit_depth,
-                                    int highbd) {
+                                    int sgr_params_idx, int bit_depth) {
   // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
   // Ctl and Dtl is 32-byte aligned.
   const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
@@ -598,12 +541,8 @@
 
   // Generate integral images from the input. C will contain sums of squares; D
   // will contain just sums
-  if (highbd)
-    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
-                           height_ext, Ctl, Dtl, buf_stride);
-  else
-    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
-                    buf_stride);
+  integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+                         height_ext, Ctl, Dtl, buf_stride);
 
   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   // Write to flt0 and flt1
@@ -618,14 +557,14 @@
     calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
                  sgr_params_idx, 0);
     final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
-                      width, height, highbd);
+                      width, height);
   }
 
   if (params->r[1] > 0) {
     calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
             1);
     final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
-                 height, highbd);
+                 height);
   }
   aom_free(buf);
   return 0;
@@ -635,12 +574,12 @@
                                            int height, int stride, int eps,
                                            const int *xqd, uint8_t *dst8,
                                            int dst_stride, int32_t *tmpbuf,
-                                           int bit_depth, int highbd) {
+                                           int bit_depth) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
   const int ret = av1_selfguided_restoration_avx2(
-      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth);
   (void)ret;
   assert(!ret);
   const sgr_params_type *const params = &av1_sgr_params[eps];
@@ -659,16 +598,10 @@
       const uint8_t *dat8ij = dat8 + i * stride + j;
       __m256i ep_0, ep_1;
       __m128i src_0, src_1;
-      if (highbd) {
-        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
-        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
-        ep_0 = _mm256_cvtepu16_epi32(src_0);
-        ep_1 = _mm256_cvtepu16_epi32(src_1);
-      } else {
-        src_0 = xx_loadu_128(dat8ij);
-        ep_0 = _mm256_cvtepu8_epi32(src_0);
-        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
-      }
+      src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+      src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+      ep_0 = _mm256_cvtepu16_epi32(src_0);
+      ep_1 = _mm256_cvtepu16_epi32(src_1);
 
       const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
       const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
@@ -699,27 +632,14 @@
       const __m256i w_1 = _mm256_srai_epi32(
           _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
 
-      if (highbd) {
-        // Pack into 16 bits and clamp to [0, 2^bit_depth)
-        // Note that packing into 16 bits messes up the order of the bits,
-        // so we use a permute function to correct this
-        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
-        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
-        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
-        const __m256i res = _mm256_min_epi16(tmp2, max);
-        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
-      } else {
-        // Pack into 8 bits and clamp to [0, 256)
-        // Note that each pack messes up the order of the bits,
-        // so we use a permute function to correct this
-        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
-        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
-        const __m256i res =
-            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
-        const __m128i res2 =
-            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
-        xx_storeu_128(dst8 + m, res2);
-      }
+      // Pack into 16 bits and clamp to [0, 2^bit_depth)
+      // Note that packing into 16 bits messes up the order of the bits,
+      // so we use a permute function to correct this
+      const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+      const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+      const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+      const __m256i res = _mm256_min_epi16(tmp2, max);
+      yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
     }
   }
 }
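
With the 8-bit branches removed, the self-guided code above now unconditionally treats dgd8/dat8/dst8 as highbd handles and recovers the real buffers through CONVERT_TO_SHORTPTR, which is why every sample offset is scaled by << 1. A small sketch of that pointer convention, assuming the CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR macros from aom_dsp/aom_dsp_common.h; the buffer name and size are illustrative:

#include <stdint.h>
#include "aom_dsp/aom_dsp_common.h" /* CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR */

/* Illustrative use of the highbd pointer convention the simplified
 * selfguided path relies on: pixels live in a uint16_t buffer, and the
 * uint8_t* handed around is an encoded handle that is only ever decoded
 * back with CONVERT_TO_SHORTPTR(), never dereferenced directly. */
static void highbd_pointer_convention_sketch(void) {
  static uint16_t frame[64 * 64];              /* real pixel storage */
  uint8_t *dgd8 = CONVERT_TO_BYTEPTR(frame);   /* handle passed to the filter */

  /* Inside final_filter()/final_filter_fast() the handle is decoded and the
   * data read back as 16-bit samples. */
  uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
  dgd16[0] = 128 << 2; /* e.g. an 8-bit value upshifted to 10-bit */
}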
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 1767d50..ceabb31 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -18,12 +18,6 @@
 #include "av1/common/restoration.h"
 #include "aom_dsp/x86/synonyms.h"
 
-// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to
-// 32-bit precision and return them in an SSE register.
-static __m128i xx_load_extend_8_32(const void *p) {
-  return _mm_cvtepu8_epi32(xx_loadl_32(p));
-}
-
 // Load 4 halfwords from the possibly-misaligned pointer p, extend each
 // halfword to 32-bit precision and return them in an SSE register.
 static __m128i xx_load_extend_16_32(const void *p) {
@@ -38,53 +32,6 @@
   return _mm_add_epi32(x01, _mm_slli_si128(x01, 8));
 }
 
-// Compute two integral images from src. B sums elements; A sums their
-// squares. The images are offset by one pixel, so will have width and height
-// equal to width + 1, height + 1 and the first row and column will be zero.
-//
-// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
-// of 4.
-static void integral_images(const uint8_t *src, int src_stride, int width,
-                            int height, int32_t *A, int32_t *B,
-                            int buf_stride) {
-  // Write out the zero top row
-  memset(A, 0, sizeof(*A) * (width + 1));
-  memset(B, 0, sizeof(*B) * (width + 1));
-
-  const __m128i zero = _mm_setzero_si128();
-  for (int i = 0; i < height; ++i) {
-    // Zero the left column.
-    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
-
-    // ldiff is the difference H - D where H is the output sample immediately
-    // to the left and D is the output sample above it. These are scalars,
-    // replicated across the four lanes.
-    __m128i ldiff1 = zero, ldiff2 = zero;
-    for (int j = 0; j < width; j += 4) {
-      const int ABj = 1 + j;
-
-      const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
-      const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
-
-      const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);
-      const __m128i x2 = _mm_madd_epi16(x1, x1);
-
-      const __m128i sc1 = scan_32(x1);
-      const __m128i sc2 = scan_32(x2);
-
-      const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
-      const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
-
-      xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
-      xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
-
-      // Calculate the new H - D.
-      ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
-      ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
-    }
-  }
-}
-
 // Compute two integral images from src. B sums elements; A sums their squares
 //
 // A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4.
@@ -280,21 +227,18 @@
 // across A, B with "cross sums" (see cross_sum implementation above).
 static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                          const int32_t *B, int buf_stride, const void *dgd8,
-                         int dgd_stride, int width, int height, int highbd) {
+                         int dgd_stride, int width, int height) {
   const int nb = 5;
   const __m128i rounding =
       round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-  const uint8_t *dgd_real =
-      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+  const uint8_t *dgd_real = (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8);
 
   for (int i = 0; i < height; ++i) {
     for (int j = 0; j < width; j += 4) {
       const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
       const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
-      const __m128i raw =
-          xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
-      const __m128i src =
-          highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+      const __m128i raw = xx_loadl_64(dgd_real + ((i * dgd_stride + j) << 1));
+      const __m128i src = _mm_cvtepu16_epi32(raw);
 
       __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
       __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
@@ -453,7 +397,7 @@
 static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                               const int32_t *B, int buf_stride,
                               const void *dgd8, int dgd_stride, int width,
-                              int height, int highbd) {
+                              int height) {
   const int nb0 = 5;
   const int nb1 = 4;
 
@@ -462,8 +406,7 @@
   const __m128i rounding1 =
       round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
 
-  const uint8_t *dgd_real =
-      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+  const uint8_t *dgd_real = (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8);
 
   for (int i = 0; i < height; ++i) {
     if (!(i & 1)) {  // even row
@@ -472,10 +415,8 @@
             cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
         const __m128i b =
             cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
-        const __m128i raw =
-            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m128i src =
-            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+        const __m128i raw = xx_loadl_64(dgd_real + ((i * dgd_stride + j) << 1));
+        const __m128i src = _mm_cvtepu16_epi32(raw);
 
         __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
         __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
@@ -487,10 +428,8 @@
       for (int j = 0; j < width; j += 4) {
         const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
         const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
-        const __m128i raw =
-            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
-        const __m128i src =
-            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+        const __m128i raw = xx_loadl_64(dgd_real + ((i * dgd_stride + j) << 1));
+        const __m128i src = _mm_cvtepu16_epi32(raw);
 
         __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
         __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
@@ -505,8 +444,7 @@
 int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
                                       int height, int dgd_stride, int32_t *flt0,
                                       int32_t *flt1, int flt_stride,
-                                      int sgr_params_idx, int bit_depth,
-                                      int highbd) {
+                                      int sgr_params_idx, int bit_depth) {
   int32_t *buf = (int32_t *)aom_memalign(
       16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
   if (!buf) return -1;
@@ -550,12 +488,8 @@
 
   // Generate integral images from the input. C will contain sums of squares; D
   // will contain just sums
-  if (highbd)
-    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
-                           height_ext, Ctl, Dtl, buf_stride);
-  else
-    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
-                    buf_stride);
+  integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+                         height_ext, Ctl, Dtl, buf_stride);
 
   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
   // Write to flt0 and flt1
@@ -570,14 +504,14 @@
     calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
                  sgr_params_idx, 0);
     final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
-                      width, height, highbd);
+                      width, height);
   }
 
   if (params->r[1] > 0) {
     calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
             1);
     final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
-                 height, highbd);
+                 height);
   }
   aom_free(buf);
   return 0;
@@ -587,12 +521,12 @@
                                              int height, int stride, int eps,
                                              const int *xqd, uint8_t *dst8,
                                              int dst_stride, int32_t *tmpbuf,
-                                             int bit_depth, int highbd) {
+                                             int bit_depth) {
   int32_t *flt0 = tmpbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   assert(width * height <= RESTORATION_UNITPELS_MAX);
   const int ret = av1_selfguided_restoration_sse4_1(
-      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth);
   (void)ret;
   assert(!ret);
   const sgr_params_type *const params = &av1_sgr_params[eps];
@@ -610,11 +544,7 @@
 
       const uint8_t *dat8ij = dat8 + i * stride + j;
       __m128i src;
-      if (highbd) {
-        src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
-      } else {
-        src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
-      }
+      src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
 
       const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
       const __m128i u_0 = _mm_cvtepu16_epi32(u);
@@ -646,18 +576,11 @@
       const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
                                          SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
 
-      if (highbd) {
-        // Pack into 16 bits and clamp to [0, 2^bit_depth)
-        const __m128i tmp = _mm_packus_epi32(w_0, w_1);
-        const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
-        const __m128i res = _mm_min_epi16(tmp, max);
-        xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
-      } else {
-        // Pack into 8 bits and clamp to [0, 256)
-        const __m128i tmp = _mm_packs_epi32(w_0, w_1);
-        const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
-        xx_storel_64(dst8 + m, res);
-      }
+      // Pack into 16 bits and clamp to [0, 2^bit_depth)
+      const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+      const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+      const __m128i res = _mm_min_epi16(tmp, max);
+      xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
     }
   }
 }
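
Both SIMD versions now address the source unconditionally through CONVERT_TO_SHORTPTR, so dgd_real and dat8ij are byte-aliased views of uint16_t buffers and pixel offsets are scaled with << 1 into byte offsets. The aliasing convention can be sketched as follows (modeled after the CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR macros in aom_dsp/aom_dsp_common.h; simplified, for illustration only):

#include <stdint.h>

// A 16-bit buffer is passed around as a uint8_t * whose address is halved;
// CONVERT_TO_SHORTPTR undoes that encoding to recover the real pointer.
static inline uint8_t *convert_to_byteptr(uint16_t *p) {
  return (uint8_t *)((uintptr_t)p >> 1);
}
static inline uint16_t *convert_to_shortptr(uint8_t *p) {
  return (uint16_t *)((uintptr_t)p << 1);
}

This is why final_filter and the projection loop index with (i * stride + j) << 1: once the short pointer is cast back to uint8_t *, offsets have to be expressed in bytes rather than in 16-bit samples.
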
diff --git a/av1/common/x86/warp_plane_avx2.c b/av1/common/x86/warp_plane_avx2.c
deleted file mode 100644
index 5729336..0000000
--- a/av1/common/x86/warp_plane_avx2.c
+++ /dev/null
@@ -1,1322 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-#include "config/av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-#include "aom_dsp/x86/synonyms.h"
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
-  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
-  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
-  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
-  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
-  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
-  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
-  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
-  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
-  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
-  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
-  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
-  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
-  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
-  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
-};
-
-DECLARE_ALIGNED(32, static const uint8_t,
-                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
-                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
-                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
-
-DECLARE_ALIGNED(32, static const uint8_t,
-                shuffle_src1[32]) = { 4,  6,  6,  8,  8,  10, 10, 12, 5,  7, 7,
-                                      9,  9,  11, 11, 13, 4,  6,  6,  8,  8, 10,
-                                      10, 12, 5,  7,  7,  9,  9,  11, 11, 13 };
-
-DECLARE_ALIGNED(32, static const uint8_t,
-                shuffle_src2[32]) = { 1, 3, 3, 5, 5,  7, 7, 9, 2, 4, 4,
-                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
-                                      7, 9, 2, 4, 4,  6, 6, 8, 8, 10 };
-
-DECLARE_ALIGNED(32, static const uint8_t,
-                shuffle_src3[32]) = { 5,  7,  7,  9,  9,  11, 11, 13, 6,  8, 8,
-                                      10, 10, 12, 12, 14, 5,  7,  7,  9,  9, 11,
-                                      11, 13, 6,  8,  8,  10, 10, 12, 12, 14 };
-
-static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
-                                          __m256i *coeff,
-                                          const __m256i *shuffle_src,
-                                          const __m256i *round_const,
-                                          const __m128i *shift, int row) {
-  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
-  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
-  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
-  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
-
-  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
-  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
-  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
-  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
-
-  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
-  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
-  const __m256i res =
-      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
-  horz_out[row] = _mm256_srl_epi16(res, *shift);
-}
-
-static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
-                                                        int sx,
-                                                        __m256i *coeff) {
-  __m128i tmp_0 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_1 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_2 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_3 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-
-  __m128i tmp_4 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_5 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_6 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_7 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
-                                  WARPEDDIFF_PREC_BITS]);
-
-  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
-  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
-  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
-  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
-
-  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
-  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
-  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
-  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
-
-  __m128i tmp_8 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
-
-  __m128i tmp_9 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
-
-  __m128i tmp_10 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
-
-  __m128i tmp_11 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
-
-  tmp_2 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
-
-  tmp_3 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
-
-  tmp_6 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
-
-  tmp_7 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
-                                  WARPEDDIFF_PREC_BITS]);
-  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
-
-  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
-  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
-  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
-  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
-
-  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
-  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
-  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
-  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
-
-  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
-  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
-  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
-  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
-}
-
-static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
-                                                              __m256i *coeff) {
-  __m128i tmp_0 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_1 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_2 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_3 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_4 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_5 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_6 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  __m128i tmp_7 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
-
-  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
-  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
-  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
-  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
-
-  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
-  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
-  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
-  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
-
-  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
-  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
-  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
-  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
-
-  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
-  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
-  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
-  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
-}
-
-static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
-                                                               __m256i *coeff) {
-  const __m128i tmp_0 =
-      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_1 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
-
-  const __m256i res_0 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
-
-  coeff[0] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
-  coeff[1] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
-  coeff[2] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
-  coeff[3] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
-}
-
-static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
-                                          int sx, int alpha, int beta, int row,
-                                          const __m256i *shuffle_src,
-                                          const __m256i *round_const,
-                                          const __m128i *shift) {
-  __m256i coeff[4];
-  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
-  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
-                         row);
-}
-static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
-                                                   __m256i *coeff) {
-  const __m128i tmp_0 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_1 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_2 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_3 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_4 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_5 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_6 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_7 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
-
-  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
-  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
-  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
-  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
-
-  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
-  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
-  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
-  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
-
-  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
-  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
-  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
-  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
-}
-
-static INLINE void warp_horizontal_filter_avx2(
-    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const __m256i *round_const, const __m128i *shift,
-    const __m256i *shuffle_src) {
-  int k, iy, sx, row = 0;
-  __m256i coeff[4];
-  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-    iy = iy4 + k;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src_0 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    iy = iy4 + k + 1;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src_1 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    const __m256i src_01 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
-    sx = sx4 + beta * (k + 4);
-    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
-                           round_const, shift);
-    row += 1;
-  }
-  iy = iy4 + k;
-  iy = clamp(iy, 0, height - 1);
-  const __m256i src_01 = _mm256_castsi128_si256(
-      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
-  sx = sx4 + beta * (k + 4);
-  prepare_horizontal_filter_coeff(alpha, sx, coeff);
-  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                         shift, row);
-}
-
-static INLINE void warp_horizontal_filter_alpha0_avx2(
-    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const __m256i *round_const, const __m128i *shift,
-    const __m256i *shuffle_src) {
-  (void)alpha;
-  int k, iy, sx, row = 0;
-  __m256i coeff[4];
-  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-    iy = iy4 + k;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src_0 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    iy = iy4 + k + 1;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src_1 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    const __m256i src_01 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
-    sx = sx4 + beta * (k + 4);
-    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
-    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                           shift, row);
-    row += 1;
-  }
-  iy = iy4 + k;
-  iy = clamp(iy, 0, height - 1);
-  const __m256i src_01 = _mm256_castsi128_si256(
-      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
-  sx = sx4 + beta * (k + 4);
-  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
-  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                         shift, row);
-}
-
-static INLINE void warp_horizontal_filter_beta0_avx2(
-    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const __m256i *round_const, const __m128i *shift,
-    const __m256i *shuffle_src) {
-  (void)beta;
-  int k, iy, row = 0;
-  __m256i coeff[4];
-  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
-  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-    iy = iy4 + k;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src_0 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    iy = iy4 + k + 1;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src_1 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    const __m256i src_01 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
-    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                           shift, row);
-    row += 1;
-  }
-  iy = iy4 + k;
-  iy = clamp(iy, 0, height - 1);
-  const __m256i src_01 = _mm256_castsi128_si256(
-      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
-  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                         shift, row);
-}
-
-static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
-    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const __m256i *round_const, const __m128i *shift,
-    const __m256i *shuffle_src) {
-  (void)alpha;
-  int k, iy, row = 0;
-  __m256i coeff[4];
-  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
-  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-    iy = iy4 + k;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src0 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    iy = iy4 + k + 1;
-    iy = clamp(iy, 0, height - 1);
-    const __m128i src1 =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    const __m256i src_01 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
-    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                           shift, row);
-    row += 1;
-  }
-  iy = iy4 + k;
-  iy = clamp(iy, 0, height - 1);
-  const __m256i src_01 = _mm256_castsi128_si256(
-      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
-  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
-                         shift, row);
-}
-
-static INLINE void unpack_weights_and_set_round_const_avx2(
-    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
-    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
-  (void)wt;
-  *res_sub_const =
-      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
-                        (1 << (offset_bits - conv_params->round_1 - 1)));
-  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
-
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m256i wt0 = _mm256_set1_epi16((short)w0);
-  const __m256i wt1 = _mm256_set1_epi16((short)w1);
-  *wt = _mm256_unpacklo_epi16(wt0, wt1);
-}
-
-static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
-                                                       int sy,
-                                                       __m256i *coeffs) {
-  __m128i filt_00 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_01 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_02 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_03 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  __m128i filt_10 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_11 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_12 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_13 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  __m256i filt_0 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
-  __m256i filt_1 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
-  __m256i filt_2 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
-  __m256i filt_3 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
-
-  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
-  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
-  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
-  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
-
-  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
-  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
-  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
-  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
-
-  filt_00 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_01 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_02 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_03 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  filt_10 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_11 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_12 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_13 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter +
-                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  filt_0 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
-  filt_1 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
-  filt_2 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
-  filt_3 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
-
-  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
-  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
-  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
-  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
-
-  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
-  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
-  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
-  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
-}
-
-static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
-                                                              __m256i *coeffs) {
-  __m128i filt_00 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_01 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_02 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  __m128i filt_03 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
-  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
-  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
-  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
-
-  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
-  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
-  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
-  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
-
-  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
-  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
-  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
-  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
-
-  filt_00 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_01 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_02 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  filt_03 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  filt_0 = _mm256_broadcastsi128_si256(filt_00);
-  filt_1 = _mm256_broadcastsi128_si256(filt_01);
-  filt_2 = _mm256_broadcastsi128_si256(filt_02);
-  filt_3 = _mm256_broadcastsi128_si256(filt_03);
-
-  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
-  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
-  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
-  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
-
-  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
-  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
-  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
-  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
-}
-
-static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
-                                                              __m256i *coeffs) {
-  const __m128i filt_0 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
-  const __m128i filt_1 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
-
-  __m256i res_0 =
-      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
-
-  coeffs[0] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
-  coeffs[1] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
-  coeffs[2] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
-  coeffs[3] = _mm256_shuffle_epi8(
-      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
-
-  coeffs[4] = coeffs[0];
-  coeffs[5] = coeffs[1];
-  coeffs[6] = coeffs[2];
-  coeffs[7] = coeffs[3];
-}
-
-static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
-                                                   __m256i *src,
-                                                   __m256i *coeffs,
-                                                   __m256i *res_lo,
-                                                   __m256i *res_hi, int row) {
-  const __m256i src_6 = horz_out[row + 3];
-  const __m256i src_7 =
-      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
-
-  src[6] = _mm256_unpacklo_epi16(src_6, src_7);
-
-  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
-  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
-  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
-  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
-
-  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
-                                            _mm256_add_epi32(res_4, res_6));
-
-  src[7] = _mm256_unpackhi_epi16(src_6, src_7);
-
-  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
-  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
-  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
-  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
-
-  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
-                                           _mm256_add_epi32(res_5, res_7));
-
-  // Rearrange pixels back into the order 0 ... 7
-  *res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
-  *res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-}
-
-static INLINE void store_vertical_filter_output_avx2(
-    const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
-    const __m256i *wt, const __m256i *res_sub_const,
-    const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
-    int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
-    const int round_bits) {
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  (void)wt;
-  __m256i res_lo_1 = *res_lo;
-  __m256i res_hi_1 = *res_hi;
-
-  if (conv_params->is_compound) {
-    __m128i *const p_0 =
-        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
-    __m128i *const p_1 =
-        (__m128i *)&conv_params
-            ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
-
-    res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
-                                 reduce_bits_vert);
-
-    const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
-    __m256i res_lo_16;
-    if (conv_params->do_average) {
-      __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-      __m128i *const dst8_1 =
-          (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
-      const __m128i p_16_0 = _mm_loadl_epi64(p_0);
-      const __m128i p_16_1 = _mm_loadl_epi64(p_1);
-      const __m256i p_16 =
-          _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
-      if (use_wtd_comp_avg) {
-        const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
-        const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
-        const __m256i shifted_32 =
-            _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-        res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
-      } else {
-        res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
-      }
-      res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
-      res_lo_16 = _mm256_srai_epi16(
-          _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
-      const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
-      const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
-      const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
-      *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
-      *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
-    } else {
-      const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
-      const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
-      _mm_storel_epi64(p_0, temp_lo_16_0);
-      _mm_storel_epi64(p_1, temp_lo_16_1);
-    }
-    if (p_width > 4) {
-      __m128i *const p4_0 =
-          (__m128i *)&conv_params
-              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-      __m128i *const p4_1 =
-          (__m128i *)&conv_params
-              ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
-      res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
-                                   reduce_bits_vert);
-      const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
-      __m256i res_hi_16;
-      if (conv_params->do_average) {
-        __m128i *const dst8_4_0 =
-            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
-        __m128i *const dst8_4_1 =
-            (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
-        const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
-        const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
-        const __m256i p4_16 = _mm256_inserti128_si256(
-            _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
-        if (use_wtd_comp_avg) {
-          const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
-          const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
-          const __m256i shifted_32 =
-              _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-          res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
-        } else {
-          res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
-        }
-        res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
-        res_hi_16 = _mm256_srai_epi16(
-            _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
-        __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
-        const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
-        const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
-        *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
-        *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
-      } else {
-        const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
-        const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
-        _mm_storel_epi64(p4_0, temp_hi_16_0);
-        _mm_storel_epi64(p4_1, temp_hi_16_1);
-      }
-    }
-  } else {
-    const __m256i res_lo_round = _mm256_srai_epi32(
-        _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
-    const __m256i res_hi_round = _mm256_srai_epi32(
-        _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
-
-    const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
-    const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
-    const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
-    const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
-
-    // Store, blending with 'pred' if needed
-    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-    __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
-
-    if (p_width == 4) {
-      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0);
-      *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1);
-    } else {
-      _mm_storel_epi64(p, res_8bit0);
-      _mm_storel_epi64(p1, res_8bit1);
-    }
-  }
-}
-
-static INLINE void warp_vertical_filter_avx2(
-    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
-    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
-    int i, int j, int sy4, const int reduce_bits_vert,
-    const __m256i *res_add_const, const int round_bits,
-    const __m256i *res_sub_const, const __m256i *round_bits_const,
-    const __m256i *wt) {
-  int k, row = 0;
-  __m256i src[8];
-  const __m256i src_0 = horz_out[0];
-  const __m256i src_1 =
-      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
-  const __m256i src_2 = horz_out[1];
-  const __m256i src_3 =
-      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
-  const __m256i src_4 = horz_out[2];
-  const __m256i src_5 =
-      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
-
-  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
-  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
-  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
-
-  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
-  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
-  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
-
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
-    int sy = sy4 + delta * (k + 4);
-    __m256i coeffs[8];
-    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
-    __m256i res_lo, res_hi;
-    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
-                                    row);
-    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
-                                      res_sub_const, round_bits_const, pred,
-                                      conv_params, i, j, k, reduce_bits_vert,
-                                      p_stride, p_width, round_bits);
-    src[0] = src[2];
-    src[2] = src[4];
-    src[4] = src[6];
-    src[1] = src[3];
-    src[3] = src[5];
-    src[5] = src[7];
-
-    row += 1;
-  }
-}
-
-static INLINE void warp_vertical_filter_gamma0_avx2(
-    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
-    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
-    int i, int j, int sy4, const int reduce_bits_vert,
-    const __m256i *res_add_const, const int round_bits,
-    const __m256i *res_sub_const, const __m256i *round_bits_const,
-    const __m256i *wt) {
-  (void)gamma;
-  int k, row = 0;
-  __m256i src[8];
-  const __m256i src_0 = horz_out[0];
-  const __m256i src_1 =
-      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
-  const __m256i src_2 = horz_out[1];
-  const __m256i src_3 =
-      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
-  const __m256i src_4 = horz_out[2];
-  const __m256i src_5 =
-      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
-
-  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
-  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
-  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
-
-  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
-  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
-  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
-
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
-    int sy = sy4 + delta * (k + 4);
-    __m256i coeffs[8];
-    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
-    __m256i res_lo, res_hi;
-    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
-                                    row);
-    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
-                                      res_sub_const, round_bits_const, pred,
-                                      conv_params, i, j, k, reduce_bits_vert,
-                                      p_stride, p_width, round_bits);
-    src[0] = src[2];
-    src[2] = src[4];
-    src[4] = src[6];
-    src[1] = src[3];
-    src[3] = src[5];
-    src[5] = src[7];
-    row += 1;
-  }
-}
-
-static INLINE void warp_vertical_filter_delta0_avx2(
-    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
-    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
-    int i, int j, int sy4, const int reduce_bits_vert,
-    const __m256i *res_add_const, const int round_bits,
-    const __m256i *res_sub_const, const __m256i *round_bits_const,
-    const __m256i *wt) {
-  (void)delta;
-  int k, row = 0;
-  __m256i src[8], coeffs[8];
-  const __m256i src_0 = horz_out[0];
-  const __m256i src_1 =
-      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
-  const __m256i src_2 = horz_out[1];
-  const __m256i src_3 =
-      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
-  const __m256i src_4 = horz_out[2];
-  const __m256i src_5 =
-      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
-
-  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
-  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
-  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
-
-  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
-  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
-  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
-
-  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
-
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
-    __m256i res_lo, res_hi;
-    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
-                                    row);
-    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
-                                      res_sub_const, round_bits_const, pred,
-                                      conv_params, i, j, k, reduce_bits_vert,
-                                      p_stride, p_width, round_bits);
-    src[0] = src[2];
-    src[2] = src[4];
-    src[4] = src[6];
-    src[1] = src[3];
-    src[3] = src[5];
-    src[5] = src[7];
-    row += 1;
-  }
-}
-
-static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
-    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
-    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
-    int i, int j, int sy4, const int reduce_bits_vert,
-    const __m256i *res_add_const, const int round_bits,
-    const __m256i *res_sub_const, const __m256i *round_bits_const,
-    const __m256i *wt) {
-  (void)gamma;
-  int k, row = 0;
-  __m256i src[8], coeffs[8];
-  const __m256i src_0 = horz_out[0];
-  const __m256i src_1 =
-      _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21);
-  const __m256i src_2 = horz_out[1];
-  const __m256i src_3 =
-      _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21);
-  const __m256i src_4 = horz_out[2];
-  const __m256i src_5 =
-      _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21);
-
-  src[0] = _mm256_unpacklo_epi16(src_0, src_1);
-  src[2] = _mm256_unpacklo_epi16(src_2, src_3);
-  src[4] = _mm256_unpacklo_epi16(src_4, src_5);
-
-  src[1] = _mm256_unpackhi_epi16(src_0, src_1);
-  src[3] = _mm256_unpackhi_epi16(src_2, src_3);
-  src[5] = _mm256_unpackhi_epi16(src_4, src_5);
-
-  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
-
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) {
-    __m256i res_lo, res_hi;
-    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
-                                    row);
-    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
-                                      res_sub_const, round_bits_const, pred,
-                                      conv_params, i, j, k, reduce_bits_vert,
-                                      p_stride, p_width, round_bits);
-    src[0] = src[2];
-    src[2] = src[4];
-    src[4] = src[6];
-    src[1] = src[3];
-    src[3] = src[5];
-    src[5] = src[7];
-    row += 1;
-  }
-}
-
-static INLINE void prepare_warp_vertical_filter_avx2(
-    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
-    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
-    int i, int j, int sy4, const int reduce_bits_vert,
-    const __m256i *res_add_const, const int round_bits,
-    const __m256i *res_sub_const, const __m256i *round_bits_const,
-    const __m256i *wt) {
-  if (gamma == 0 && delta == 0)
-    warp_vertical_filter_gamma0_delta0_avx2(
-        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
-        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
-        round_bits_const, wt);
-  else if (gamma == 0 && delta != 0)
-    warp_vertical_filter_gamma0_avx2(
-        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
-        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
-        round_bits_const, wt);
-  else if (gamma != 0 && delta == 0)
-    warp_vertical_filter_delta0_avx2(
-        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
-        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
-        round_bits_const, wt);
-  else
-    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
-                              p_height, p_stride, p_width, i, j, sy4,
-                              reduce_bits_vert, res_add_const, round_bits,
-                              res_sub_const, round_bits_const, wt);
-}
-
-static INLINE void prepare_warp_horizontal_filter_avx2(
-    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const __m256i *round_const, const __m128i *shift,
-    const __m256i *shuffle_src) {
-  if (alpha == 0 && beta == 0)
-    warp_horizontal_filter_alpha0_beta0_avx2(
-        ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
-        round_const, shift, shuffle_src);
-  else if (alpha == 0 && beta != 0)
-    warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
-                                       alpha, beta, p_height, height, i,
-                                       round_const, shift, shuffle_src);
-  else if (alpha != 0 && beta == 0)
-    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
-                                      alpha, beta, p_height, height, i,
-                                      round_const, shift, shuffle_src);
-  else
-    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
-                                beta, p_height, height, i, round_const, shift,
-                                shuffle_src);
-}
-
-int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int p_width,
-                                  int p_height, int dst_stride) {
-  int64_t sum_error = 0;
-  int i, j;
-  __m256i row_error, col_error;
-  __m256i zero = _mm256_set1_epi16(0);
-  __m256i dup_255 = _mm256_set1_epi16(255);
-  col_error = zero;
-
-  for (i = 0; i < (p_height / 4); i++) {
-    row_error = _mm256_set1_epi16(0);
-    for (j = 0; j < (p_width / 16); j++) {
-      __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
-      __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
-      __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
-      __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
-      __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
-      __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
-      __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
-      __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
-
-      __m256i diff_1 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255);
-      __m256i diff_2 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255);
-      __m256i diff_3 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255);
-      __m256i diff_4 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255);
-
-      __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
-      __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
-      __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
-      __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
-      __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
-      __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
-      __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
-      __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
-
-      __m256i error_1_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
-      __m256i error_1_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
-      __m256i error_2_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
-      __m256i error_2_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
-      __m256i error_3_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
-      __m256i error_3_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
-      __m256i error_4_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
-      __m256i error_4_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
-
-      __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
-      __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
-      __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
-      __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
-
-      __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
-      __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
-
-      __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
-      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
-    }
-    __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
-    __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
-    __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm256_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int k = 0; k < 4; ++k) {
-        for (int l = j * 16; l < p_width; ++l) {
-          sum_error +=
-              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
-                                     ref[l + ((i * 4) + k) * ref_stride]);
-        }
-      }
-    }
-  }
-  __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error);
-  __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1);
-  sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1);
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, sum_error_q_0);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  // Error summation for remaining height, which is not multiple of 4
-  if (p_height & 0x3) {
-    for (int k = i * 4; k < p_height; ++k) {
-      for (int l = 0; l < p_width; ++l) {
-        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
-                                            ref[l + k * ref_stride]);
-      }
-    }
-  }
-  return sum_error;
-}
-
-void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
-                          int height, int stride, uint8_t *pred, int p_col,
-                          int p_row, int p_width, int p_height, int p_stride,
-                          int subsampling_x, int subsampling_y,
-                          ConvolveParams *conv_params, int16_t alpha,
-                          int16_t beta, int16_t gamma, int16_t delta) {
-  __m256i horz_out[8];
-  int i, j, k;
-  const int bd = 8;
-  const int reduce_bits_horiz = conv_params->round_0;
-  const int reduce_bits_vert = conv_params->is_compound
-                                   ? conv_params->round_1
-                                   : 2 * FILTER_BITS - reduce_bits_horiz;
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  const __m256i reduce_bits_vert_const =
-      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
-  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
-
-  const __m256i round_const = _mm256_set1_epi16(
-      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
-
-  __m256i res_sub_const, round_bits_const, wt;
-  unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
-                                          &res_sub_const, &round_bits_const,
-                                          &wt);
-
-  __m256i res_add_const_1;
-  if (conv_params->is_compound == 1) {
-    res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
-  } else {
-    res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
-                                        ((1 << reduce_bits_vert) >> 1));
-  }
-  const int32_t const1 = alpha * (-4) + beta * (-4) +
-                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-  const int32_t const2 = gamma * (-4) + delta * (-4) +
-                         (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-                         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-  const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
-  const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
-  const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
-
-  __m256i shuffle_src[4];
-  shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
-  shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
-  shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
-  shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
-
-  for (i = 0; i < p_height; i += 8) {
-    for (j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      // Add in all the constant terms, including rounding and offset
-      sx4 += const1;
-      sy4 += const2;
-
-      sx4 &= ~const3;
-      sy4 &= ~const3;
-
-      // Horizontal filter
-      // If the block is aligned such that, after clamping, every sample
-      // would be taken from the leftmost/rightmost column, then we can
-      // skip the expensive horizontal filter.
-
-      if (ix4 <= -7) {
-        int iy, row = 0;
-        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-          iy = iy4 + k;
-          iy = clamp(iy, 0, height - 1);
-          const __m256i temp_0 =
-              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
-          iy = iy4 + k + 1;
-          iy = clamp(iy, 0, height - 1);
-          const __m256i temp_1 =
-              _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
-          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
-          row += 1;
-        }
-        iy = iy4 + k;
-        iy = clamp(iy, 0, height - 1);
-        horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
-      } else if (ix4 >= width + 6) {
-        int iy, row = 0;
-        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-          iy = iy4 + k;
-          iy = clamp(iy, 0, height - 1);
-          const __m256i temp_0 = _mm256_set1_epi16(
-              const4 + ref[iy * stride + (width - 1)] * const5);
-          iy = iy4 + k + 1;
-          iy = clamp(iy, 0, height - 1);
-          const __m256i temp_1 = _mm256_set1_epi16(
-              const4 + ref[iy * stride + (width - 1)] * const5);
-          horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
-          row += 1;
-        }
-        iy = iy4 + k;
-        iy = clamp(iy, 0, height - 1);
-        horz_out[row] =
-            _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
-      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
-        const int out_of_boundary_left = -(ix4 - 6);
-        const int out_of_boundary_right = (ix4 + 8) - width;
-        int iy, sx, row = 0;
-        for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
-          iy = iy4 + k;
-          iy = clamp(iy, 0, height - 1);
-          __m128i src0 =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          iy = iy4 + k + 1;
-          iy = clamp(iy, 0, height - 1);
-          __m128i src1 =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-
-          if (out_of_boundary_left >= 0) {
-            const __m128i shuffle_reg_left =
-                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
-            src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
-            src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
-          }
-          if (out_of_boundary_right >= 0) {
-            const __m128i shuffle_reg_right = _mm_loadu_si128(
-                (__m128i *)warp_pad_right[out_of_boundary_right]);
-            src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
-            src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
-          }
-          sx = sx4 + beta * (k + 4);
-          const __m256i src_01 =
-              _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
-          horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
-                                 shuffle_src, &round_const, &shift);
-          row += 1;
-        }
-        iy = iy4 + k;
-        iy = clamp(iy, 0, height - 1);
-        __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-        if (out_of_boundary_left >= 0) {
-          const __m128i shuffle_reg_left =
-              _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
-          src = _mm_shuffle_epi8(src, shuffle_reg_left);
-        }
-        if (out_of_boundary_right >= 0) {
-          const __m128i shuffle_reg_right =
-              _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
-          src = _mm_shuffle_epi8(src, shuffle_reg_right);
-        }
-        sx = sx4 + beta * (k + 4);
-        const __m256i src_01 = _mm256_castsi128_si256(src);
-        __m256i coeff[4];
-        prepare_horizontal_filter_coeff(alpha, sx, coeff);
-        filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
-                               &round_const, &shift, row);
-      } else {
-        prepare_warp_horizontal_filter_avx2(
-            ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
-            i, &round_const, &shift, shuffle_src);
-      }
-
-      // Vertical filter
-      prepare_warp_vertical_filter_avx2(
-          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
-          p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
-          &res_sub_const, &round_bits_const, &wt);
-    }
-  }
-}
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
deleted file mode 100644
index 35caad6..0000000
--- a/av1/common/x86/warp_plane_sse2.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "av1/common/warped_motion.h"
-#include "config/av1_rtcd.h"
-
-int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int p_width,
-                                  int p_height, int dst_stride) {
-  int64_t sum_error = 0;
-  int i, j;
-  __m128i row_error, col_error;
-  __m128i zero = _mm_set1_epi16(0);
-  __m128i dup_255 = _mm_set1_epi16(255);
-  col_error = zero;
-  for (i = 0; i < (p_height); i++) {
-    row_error = zero;
-    for (j = 0; j < (p_width / 16); j++) {
-      __m128i ref_8 =
-          _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
-      __m128i dst_8 =
-          _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
-      __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
-      __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
-      __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
-      __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
-
-      __m128i diff_1 =
-          _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
-      __m128i diff_2 =
-          _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
-
-      __m128i error_1_lo =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 2)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 1)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
-      __m128i error_1_hi =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 6)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 5)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
-      __m128i error_2_lo =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 2)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 1)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
-      __m128i error_2_hi =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 6)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 5)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
-
-      __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
-      __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
-      __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
-
-      row_error = _mm_add_epi32(row_error, error_1_2);
-    }
-    __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
-    __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
-    __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int l = j * 16; l < p_width; ++l) {
-        sum_error += (int64_t)error_measure(dst[l + i * dst_stride] -
-                                            ref[l + i * ref_stride]);
-      }
-    }
-  }
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, col_error);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  return sum_error;
-}
diff --git a/av1/common/x86/warp_plane_sse4.c b/av1/common/x86/warp_plane_sse4.c
deleted file mode 100644
index a47d6ad..0000000
--- a/av1/common/x86/warp_plane_sse4.c
+++ /dev/null
@@ -1,967 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-
-#include "av1/common/warped_motion.h"
-
-/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
-   * Each coefficient is stored in 8 bits instead of 16 bits
-   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
-
-     This is done in order to avoid overflow: Since the tap with the largest
-     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
-     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
-     convolve functions.
-
-     Instead, we use the summation order
-     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
-     The rearrangement of coefficients in this table is so that we can get the
-     coefficients into the correct order more quickly.
-*/
-/* clang-format off */
-DECLARE_ALIGNED(8, const int8_t,
-                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
-#if WARPEDPIXEL_PREC_BITS == 6
-  // [-1, 0)
-  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
-  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
-  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
-  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
-  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
-  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
-  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
-  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
-  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
-  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
-  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
-  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
-  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
-  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
-  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
-  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
-  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
-  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
-  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
-  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
-  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
-  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
-  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
-  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
-  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
-  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
-  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
-  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
-  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
-  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
-  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
-  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
-  // [0, 1)
-  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
-  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
-  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
-  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
-  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
-  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
-  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
-  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
-  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
-  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
-  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
-  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
-  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
-  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
-  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
-  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
-  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
-  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
-  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
-  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
-  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
-  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
-  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
-  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
-  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
-  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
-  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
-  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
-  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
-  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
-  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
-  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
-  // [1, 2)
-  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
-  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
-  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
-  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
-  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
-  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
-  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
-  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
-  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
-  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
-  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
-  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
-  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
-  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
-  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
-  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
-  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
-  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
-  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
-  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
-  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
-  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
-  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
-  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
-  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
-  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
-  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
-  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
-  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
-  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
-  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
-  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
-  // dummy (replicate row index 191)
-  { 0, 0,   2,  -1, 0,   0, 127, 0},
-
-#else
-  // [-1, 0)
-  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
-  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
-  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
-  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
-  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
-  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
-  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
-  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
-  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
-  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
-  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
-  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
-  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
-  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
-  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
-  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
-  // [0, 1)
-  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
-  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
-  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
-  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
-  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
-  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
-  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
-  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
-  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
-  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
-  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
-  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
-  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
-  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
-  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
-  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
-  // [1, 2)
-  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
-  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
-  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
-  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
-  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
-  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
-  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
-  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
-  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
-  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
-  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
-  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
-  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
-  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
-  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
-  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
-  // dummy (replicate row index 95)
-  { 0, 0,   4,  -3, 0,  -1, 127, 1},
-#endif  // WARPEDPIXEL_PREC_BITS == 6
-};
-/* clang-format on */
-
-// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
-// in an SSE register into two sequences:
-// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
-// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
-DECLARE_ALIGNED(16, static const uint8_t,
-                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
-                                   8, 10, 10, 12, 12, 14, 14, 0 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
-                                  9, 11, 11, 13, 13, 15, 15, 0 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
-                                               0, 1, 0, 1, 0, 1, 0, 1 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
-                                               2, 3, 2, 3, 2, 3, 2, 3 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
-                                               4, 5, 4, 5, 4, 5, 4, 5 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
-                                               6, 7, 6, 7, 6, 7, 6, 7 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
-                                              0, 1, 2, 3, 0, 1, 2, 3 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
-                                              4, 5, 6, 7, 4, 5, 6, 7 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
-                                              8, 9, 10, 11, 8, 9, 10, 11 };
-
-DECLARE_ALIGNED(16, static const uint8_t,
-                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
-                                              12, 13, 14, 15, 12, 13, 14, 15 };
-
-static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
-                                     const int offset_bits_horiz,
-                                     const int reduce_bits_horiz, int k) {
-  const __m128i src_even =
-      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
-  const __m128i src_odd =
-      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
-  // The pixel order we need for 'src' is:
-  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
-  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
-  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
-  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
-  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
-                                            _mm_srli_si128(src_odd, 4));
-  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
-  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
-  const __m128i src_13 =
-      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
-  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
-  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
-  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
-                                            _mm_srli_si128(src_even, 6));
-  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
-
-  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
-                                             ((1 << reduce_bits_horiz) >> 1));
-
-  // Note: The values res_02 + res_46 and res_13 + res_57 both
-  // fit into int16s at this point, but their sum may be too wide to fit
-  // into an int16. However, once we also add round_const, the sum of
-  // all of these fits into a uint16.
-  //
-  // The wrapping behaviour of _mm_add_* is used here to make sure we
-  // get the correct result despite converting between different
-  // (implicit) types.
-  const __m128i res_even = _mm_add_epi16(res_02, res_46);
-  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
-  const __m128i res =
-      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
-  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
-}
-
-static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
-                                                   __m128i *coeff) {
-  // Filter even-index pixels
-  const __m128i tmp_0 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_1 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_2 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_3 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_4 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_5 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_6 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
-  const __m128i tmp_7 = _mm_loadl_epi64(
-      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
-
-  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
-  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
-  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
-  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
-  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
-  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
-  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
-  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
-
-  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
-  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
-  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
-  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
-  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
-  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
-  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
-  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
-
-  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
-  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
-  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
-  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
-}
-
-static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
-                                                          __m128i *coeff) {
-  // Filter even-index pixels
-  const __m128i tmp_0 =
-      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
-
-  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
-  coeff[0] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
-  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
-  coeff[1] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
-  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
-  coeff[2] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
-  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
-  coeff[3] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
-}
-
-static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
-                                     int alpha, int k,
-                                     const int offset_bits_horiz,
-                                     const int reduce_bits_horiz) {
-  __m128i coeff[4];
-  prepare_horizontal_filter_coeff(alpha, sx, coeff);
-  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
-}
-
-static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
-                                          int stride, int32_t ix4, int32_t iy4,
-                                          int32_t sx4, int alpha, int beta,
-                                          int p_height, int height, int i,
-                                          const int offset_bits_horiz,
-                                          const int reduce_bits_horiz) {
-  int k;
-  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-    int iy = iy4 + k;
-    if (iy < 0)
-      iy = 0;
-    else if (iy > height - 1)
-      iy = height - 1;
-    int sx = sx4 + beta * (k + 4);
-
-    // Load source pixels
-    const __m128i src =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
-                      reduce_bits_horiz);
-  }
-}
-
-static INLINE void warp_horizontal_filter_alpha0(
-    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const int offset_bits_horiz, const int reduce_bits_horiz) {
-  (void)alpha;
-  int k;
-  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-    int iy = iy4 + k;
-    if (iy < 0)
-      iy = 0;
-    else if (iy > height - 1)
-      iy = height - 1;
-    int sx = sx4 + beta * (k + 4);
-
-    // Load source pixels
-    const __m128i src =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-
-    __m128i coeff[4];
-    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
-    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
-  }
-}
-
-static INLINE void warp_horizontal_filter_beta0(
-    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const int offset_bits_horiz, const int reduce_bits_horiz) {
-  (void)beta;
-  int k;
-  __m128i coeff[4];
-  prepare_horizontal_filter_coeff(alpha, sx4, coeff);
-
-  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-    int iy = iy4 + k;
-    if (iy < 0)
-      iy = 0;
-    else if (iy > height - 1)
-      iy = height - 1;
-
-    // Load source pixels
-    const __m128i src =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
-  }
-}
-
-static INLINE void warp_horizontal_filter_alpha0_beta0(
-    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const int offset_bits_horiz, const int reduce_bits_horiz) {
-  (void)beta;
-  (void)alpha;
-  int k;
-
-  __m128i coeff[4];
-  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
-
-  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-    int iy = iy4 + k;
-    if (iy < 0)
-      iy = 0;
-    else if (iy > height - 1)
-      iy = height - 1;
-
-    // Load source pixels
-    const __m128i src =
-        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
-  }
-}
-
-static INLINE void unpack_weights_and_set_round_const(
-    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
-    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
-  (void)wt;
-  *res_sub_const =
-      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
-                     (1 << (offset_bits - conv_params->round_1 - 1)));
-  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
-
-  const int w0 = conv_params->fwd_offset;
-  const int w1 = conv_params->bck_offset;
-  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
-  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
-  *wt = _mm_unpacklo_epi16(wt0, wt1);
-}
-
-static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
-                                                  __m128i *coeffs) {
-  const __m128i tmp_0 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_2 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_4 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_6 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
-  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
-  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
-  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
-  // even coeffs
-  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
-  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
-  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
-  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
-  const __m128i tmp_1 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_3 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_5 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
-  const __m128i tmp_7 =
-      _mm_loadu_si128((__m128i *)(av1_warped_filter +
-                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
-  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
-  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
-  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
-  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
-  // odd coeffs
-  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
-  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
-  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
-  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
-}
-
-static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
-                                                         __m128i *coeffs) {
-  const __m128i tmp_0 = _mm_loadu_si128(
-      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
-
-  // even coeffs
-  coeffs[0] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
-  coeffs[1] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
-  coeffs[2] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
-  coeffs[3] =
-      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));
-
-  // odd coeffs
-  coeffs[4] = coeffs[0];
-  coeffs[5] = coeffs[1];
-  coeffs[6] = coeffs[2];
-  coeffs[7] = coeffs[3];
-}
-
-static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
-                                              __m128i *res_lo, __m128i *res_hi,
-                                              int k) {
-  // Load from tmp and rearrange pairs of consecutive rows into the
-  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
-  const __m128i *src = tmp + (k + 4);
-  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
-  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
-  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
-  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
-  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
-  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
-  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
-  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
-
-  const __m128i res_even =
-      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
-
-  // Filter odd-index pixels
-  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
-  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
-  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
-  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
-  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
-  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
-  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
-  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
-
-  const __m128i res_odd =
-      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
-
-  // Rearrange pixels back into the order 0 ... 7
-  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-}
-
-static INLINE void store_vertical_filter_output(
-    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
-    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
-    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
-    const int reduce_bits_vert, int p_stride, int p_width,
-    const int round_bits) {
-  const int use_wtd_comp_avg = is_uneven_wtd_comp_avg(conv_params);
-  (void)wt;
-  __m128i res_lo_1 = *res_lo;
-  __m128i res_hi_1 = *res_hi;
-
-  if (conv_params->is_compound) {
-    __m128i *const p =
-        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
-    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
-                              reduce_bits_vert);
-    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
-    __m128i res_lo_16;
-    if (conv_params->do_average) {
-      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-      const __m128i p_16 = _mm_loadl_epi64(p);
-
-      if (use_wtd_comp_avg) {
-        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
-        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
-        const __m128i shifted_32 =
-            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
-      } else {
-        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
-      }
-
-      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
-
-      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
-                                 round_bits);
-      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
-      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
-    } else {
-      _mm_storel_epi64(p, temp_lo_16);
-    }
-    if (p_width > 4) {
-      __m128i *const p4 =
-          (__m128i *)&conv_params
-              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
-      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
-                                reduce_bits_vert);
-      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
-      __m128i res_hi_16;
-
-      if (conv_params->do_average) {
-        __m128i *const dst8_4 =
-            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
-        const __m128i p4_16 = _mm_loadl_epi64(p4);
-
-        if (use_wtd_comp_avg) {
-          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
-          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
-          const __m128i shifted_32 =
-              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
-        } else {
-          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
-        }
-        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
-
-        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
-                                   round_bits);
-        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
-        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
-
-      } else {
-        _mm_storel_epi64(p4, temp_hi_16);
-      }
-    }
-  } else {
-    const __m128i res_lo_round = _mm_srai_epi32(
-        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
-    const __m128i res_hi_round = _mm_srai_epi32(
-        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
-
-    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-    // Store, blending with 'pred' if needed
-    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
-    // Note: If we're outputting a 4x4 block, we need to be very careful
-    // to only output 4 pixels at this point, to avoid encode/decode
-    // mismatches when encoding with multiple threads.
-    if (p_width == 4) {
-      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
-    } else {
-      _mm_storel_epi64(p, res_8bit);
-    }
-  }
-}
-
-static INLINE void warp_vertical_filter(
-    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
-    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
-    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
-    const int round_bits, const int offset_bits) {
-  int k;
-  __m128i res_sub_const, round_bits_const, wt;
-  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
-                                     &res_sub_const, &round_bits_const, &wt);
-  // Vertical filter
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-    int sy = sy4 + delta * (k + 4);
-
-    __m128i coeffs[8];
-    prepare_vertical_filter_coeffs(gamma, sy, coeffs);
-
-    __m128i res_lo;
-    __m128i res_hi;
-    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
-
-    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
-                                 &res_sub_const, &round_bits_const, pred,
-                                 conv_params, i, j, k, reduce_bits_vert,
-                                 p_stride, p_width, round_bits);
-  }
-}
-
-static INLINE void warp_vertical_filter_gamma0(
-    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
-    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
-    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
-    const int round_bits, const int offset_bits) {
-  int k;
-  (void)gamma;
-  __m128i res_sub_const, round_bits_const, wt;
-  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
-                                     &res_sub_const, &round_bits_const, &wt);
-  // Vertical filter
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-    int sy = sy4 + delta * (k + 4);
-
-    __m128i coeffs[8];
-    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
-
-    __m128i res_lo;
-    __m128i res_hi;
-    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
-
-    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
-                                 &res_sub_const, &round_bits_const, pred,
-                                 conv_params, i, j, k, reduce_bits_vert,
-                                 p_stride, p_width, round_bits);
-  }
-}
-
-static INLINE void warp_vertical_filter_delta0(
-    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
-    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
-    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
-    const int round_bits, const int offset_bits) {
-  (void)delta;
-  int k;
-  __m128i res_sub_const, round_bits_const, wt;
-  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
-                                     &res_sub_const, &round_bits_const, &wt);
-
-  __m128i coeffs[8];
-  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
-  // Vertical filter
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-    __m128i res_lo;
-    __m128i res_hi;
-    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
-
-    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
-                                 &res_sub_const, &round_bits_const, pred,
-                                 conv_params, i, j, k, reduce_bits_vert,
-                                 p_stride, p_width, round_bits);
-  }
-}
-
-static INLINE void warp_vertical_filter_gamma0_delta0(
-    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
-    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
-    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
-    const int round_bits, const int offset_bits) {
-  (void)delta;
-  (void)gamma;
-  int k;
-  __m128i res_sub_const, round_bits_const, wt;
-  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
-                                     &res_sub_const, &round_bits_const, &wt);
-
-  __m128i coeffs[8];
-  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
-  // Vertical filter
-  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
-    __m128i res_lo;
-    __m128i res_hi;
-    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
-
-    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
-                                 &res_sub_const, &round_bits_const, pred,
-                                 conv_params, i, j, k, reduce_bits_vert,
-                                 p_stride, p_width, round_bits);
-  }
-}
-
-static INLINE void prepare_warp_vertical_filter(
-    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
-    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
-    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
-    const int round_bits, const int offset_bits) {
-  if (gamma == 0 && delta == 0)
-    warp_vertical_filter_gamma0_delta0(
-        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
-        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
-  else if (gamma == 0 && delta != 0)
-    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
-                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
-                                res_add_const, round_bits, offset_bits);
-  else if (gamma != 0 && delta == 0)
-    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
-                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
-                                res_add_const, round_bits, offset_bits);
-  else
-    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
-                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
-                         res_add_const, round_bits, offset_bits);
-}
-
-static INLINE void prepare_warp_horizontal_filter(
-    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
-    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
-    const int offset_bits_horiz, const int reduce_bits_horiz) {
-  if (alpha == 0 && beta == 0)
-    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
-                                        beta, p_height, height, i,
-                                        offset_bits_horiz, reduce_bits_horiz);
-  else if (alpha == 0 && beta != 0)
-    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
-                                  p_height, height, i, offset_bits_horiz,
-                                  reduce_bits_horiz);
-  else if (alpha != 0 && beta == 0)
-    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
-                                 p_height, height, i, offset_bits_horiz,
-                                 reduce_bits_horiz);
-  else
-    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
-                           p_height, height, i, offset_bits_horiz,
-                           reduce_bits_horiz);
-}
-
-void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
-                            int height, int stride, uint8_t *pred, int p_col,
-                            int p_row, int p_width, int p_height, int p_stride,
-                            int subsampling_x, int subsampling_y,
-                            ConvolveParams *conv_params, int16_t alpha,
-                            int16_t beta, int16_t gamma, int16_t delta) {
-  __m128i tmp[15];
-  int i, j, k;
-  const int bd = 8;
-  const int reduce_bits_horiz = conv_params->round_0;
-  const int reduce_bits_vert = conv_params->is_compound
-                                   ? conv_params->round_1
-                                   : 2 * FILTER_BITS - reduce_bits_horiz;
-  const int offset_bits_horiz = bd + FILTER_BITS - 1;
-  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
-
-  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
-  const __m128i reduce_bits_vert_const =
-      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
-  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
-
-  /* Note: For this code to work, the left/right frame borders need to be
-  extended by at least 13 pixels each. By the time we get here, other
-  code will have set up this border, but we allow an explicit check
-  for debugging purposes.
-  */
-  /*for (i = 0; i < height; ++i) {
-  for (j = 0; j < 13; ++j) {
-  assert(ref[i * stride - 13 + j] == ref[i * stride]);
-  assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
-  }
-  }*/
-  __m128i res_add_const_1;
-  if (conv_params->is_compound == 1) {
-    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
-  } else {
-    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
-                                     ((1 << reduce_bits_vert) >> 1));
-  }
-
-  for (i = 0; i < p_height; i += 8) {
-    for (j = 0; j < p_width; j += 8) {
-      const int32_t src_x = (p_col + j + 4) << subsampling_x;
-      const int32_t src_y = (p_row + i + 4) << subsampling_y;
-      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
-      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
-      const int32_t x4 = dst_x >> subsampling_x;
-      const int32_t y4 = dst_y >> subsampling_y;
-
-      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-      // Add in all the constant terms, including rounding and offset
-      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
-             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
-      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
-      // Horizontal filter
-      // If the block is aligned such that, after clamping, every sample
-      // would be taken from the leftmost/rightmost column, then we can
-      // skip the expensive horizontal filter.
-      if (ix4 <= -7) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          tmp[k + 7] = _mm_set1_epi16(
-              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
-        }
-      } else if (ix4 >= width + 6) {
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          tmp[k + 7] =
-              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
-                             ref[iy * stride + (width - 1)] *
-                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
-        }
-      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
-        const int out_of_boundary_left = -(ix4 - 6);
-        const int out_of_boundary_right = (ix4 + 8) - width;
-        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-          int sx = sx4 + beta * (k + 4);
-
-          // Load source pixels
-          __m128i src =
-              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-          if (out_of_boundary_left >= 0) {
-            const __m128i shuffle_reg_left =
-                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
-            src = _mm_shuffle_epi8(src, shuffle_reg_left);
-          }
-          if (out_of_boundary_right >= 0) {
-            const __m128i shuffle_reg_right = _mm_loadu_si128(
-                (__m128i *)warp_pad_right[out_of_boundary_right]);
-            src = _mm_shuffle_epi8(src, shuffle_reg_right);
-          }
-          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
-                            reduce_bits_horiz);
-        }
-      } else {
-        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
-                                       beta, p_height, height, i,
-                                       offset_bits_horiz, reduce_bits_horiz);
-      }
-
-      // Vertical filter
-      prepare_warp_vertical_filter(
-          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
-          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
-    }
-  }
-}
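
Note (aside, not part of the patch): the `ix4 <= -7` / `ix4 >= width + 6` shortcut in the deleted warp code works because the warp filter taps are normalised to sum to `1 << FILTER_BITS`, so filtering a row of identical clamped border pixels collapses to one multiply plus the horizontal offset. A minimal scalar sketch of that identity, with illustrative constants (`FILTER_BITS` is 7 in libaom; `reduce_bits_horiz` comes from `ConvolveParams` at run time; the tap values below are made up, the real tables live in av1/common/warped_motion.c):

```c
#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

/* If the 8 taps sum to 1 << FILTER_BITS, filtering a constant pixel value p
 * equals the value the deleted code broadcast with _mm_set1_epi16(). */
static int32_t filter_constant_row(int32_t p, const int16_t taps[8], int bd,
                                   int reduce_bits_horiz) {
  int32_t sum = 1 << (bd + FILTER_BITS - 1); /* offset_bits_horiz term */
  for (int k = 0; k < 8; ++k) sum += taps[k] * p;
  const int32_t filtered = sum >> reduce_bits_horiz;
  const int32_t shortcut =
      (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
      p * (1 << (FILTER_BITS - reduce_bits_horiz));
  assert(filtered == shortcut); /* holds whenever the taps sum to 1 << FILTER_BITS */
  return shortcut;
}

int main(void) {
  const int16_t taps[8] = { -2, 6, -12, 124, 18, -8, 4, -2 }; /* sums to 128 */
  filter_constant_row(200, taps, /*bd=*/8, /*reduce_bits_horiz=*/3);
  return 0;
}
```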
diff --git a/av1/common/x86/wiener_convolve_avx2.c b/av1/common/x86/wiener_convolve_avx2.c
deleted file mode 100644
index 44ed35c..0000000
--- a/av1/common/x86/wiener_convolve_avx2.c
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-#include <assert.h>
-
-#include "config/av1_rtcd.h"
-
-#include "av1/common/convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
-// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
-// on the left.
-// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
-// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
-
-// Exploiting the range of wiener filter coefficients,
-// horizontal filtering can be done in 16 bit intermediate precision.
-// The details are as follows :
-// Consider the horizontal wiener filter coefficients of the following form :
-//      [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
-// Subtracting  2^(FILTER_BITS) from the centre tap we get the following  :
-//      [C0, C1, C2,     -2 * (C0 + C1 + C2),             C2, C1, C0]
-// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
-// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
-// precision. Finally, after rounding the above result by round_0, we multiply
-// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
-// horizontal filter output.
-
-void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h,
-                                      const ConvolveParams *conv_params) {
-  const int bd = 8;
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
-  int im_h = h + SUBPEL_TAPS - 2;
-  int im_stride = 8;
-  memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
-  int i, j;
-  const int center_tap = (SUBPEL_TAPS - 1) / 2;
-  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
-
-  assert(conv_params->round_0 > 0);
-
-  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
-  filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
-
-  const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
-  const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs_h[0] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs_h[1] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs_h[2] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs_h[3] =
-      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
-
-  const __m256i round_const_h =
-      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
-  const __m256i round_const_horz =
-      _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
-  const __m256i clamp_low = _mm256_setzero_si256();
-  const __m256i clamp_high =
-      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
-
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i zero_128 = _mm_setzero_si128();
-  const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
-  const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
-
-  const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
-
-  const __m256i round_const_v =
-      _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
-                        (1 << (bd + conv_params->round_1 - 1)));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
-  for (j = 0; j < w; j += 8) {
-    for (i = 0; i < im_h; i += 2) {
-      __m256i data = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
-
-      // Load the next line
-      if (i + 1 < im_h)
-        data = _mm256_inserti128_si256(
-            data,
-            _mm_loadu_si128(
-                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
-            1);
-
-      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
-
-      res =
-          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
-      __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
-
-      // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
-      // the result
-      data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
-      res = _mm256_add_epi16(res, data_0);
-      res = _mm256_add_epi16(res, round_const_horz);
-      const __m256i res_clamped =
-          _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
-      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
-    }
-
-    /* Vertical filter */
-    {
-      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
-      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
-      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
-      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
-      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
-      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
-      __m256i s[8];
-      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
-      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
-      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
-      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
-      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
-      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
-
-      for (i = 0; i < h - 1; i += 2) {
-        const int16_t *data = &im_block[i * im_stride];
-
-        const __m256i s6 =
-            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
-        const __m256i s7 =
-            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
-        s[3] = _mm256_unpacklo_epi16(s6, s7);
-        s[7] = _mm256_unpackhi_epi16(s6, s7);
-
-        __m256i res_a = convolve(s, coeffs_v);
-        __m256i res_b = convolve(s + 4, coeffs_v);
-
-        const __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
-        const __m256i res_b_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-
-        /* rounding code */
-        // 16 bit conversion
-        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
-        // 8 bit conversion and saturation to uint8
-        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
-
-        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
-        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
-        // Store values into the destination buffer
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-
-        _mm_storel_epi64(p_0, res_0);
-        _mm_storel_epi64(p_1, res_1);
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-      }
-      if (h - i) {
-        s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
-        s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
-        s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
-
-        const int16_t *data = &im_block[i * im_stride];
-        const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
-        const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
-
-        __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
-        __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
-
-        s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
-        __m256i convolveres = convolve(s, coeffs_v);
-
-        const __m256i res_round = _mm256_sra_epi32(
-            _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
-
-        /* rounding code */
-        // 16 bit conversion
-        __m128i reslo = _mm256_castsi256_si128(res_round);
-        __m128i reshi = _mm256_extracti128_si256(res_round, 1);
-        const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
-
-        // 8 bit conversion and saturation to uint8
-        const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
-        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storel_epi64(p_0, res_8b);
-      }
-    }
-  }
-}
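
Note (aside): the precision argument in the deleted AVX2 comment above — subtract `2^FILTER_BITS` from the centre tap so the horizontal sum fits in 16 bits, then add the centre pixel back after the first rounding — is a plain arithmetic identity. Below is a scalar sketch of just that split, omitting the horizontal offset and clamping, and assuming a 7-tap Wiener filter whose centre tap is `2^FILTER_BITS - 2*(C0 + C1 + C2)` as the comment states; the coefficient and pixel values are illustrative:

```c
#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

/* Direct 7-tap filtering followed by rounding with round_0. */
static int32_t wiener_horiz_direct(const uint8_t *p, const int16_t f[7],
                                   int round_0) {
  int32_t sum = 0;
  for (int k = 0; k < 7; ++k) sum += f[k] * p[k];
  return (sum + (1 << (round_0 - 1))) >> round_0;
}

/* Same result via the split used by the deleted SIMD code: filter with the
 * centre tap reduced by 2^FILTER_BITS (this partial sum is what stays within
 * int16_t for the Wiener coefficient range), then add the centre pixel scaled
 * by 2^(FILTER_BITS - round_0) after rounding. */
static int32_t wiener_horiz_split(const uint8_t *p, const int16_t f[7],
                                  int round_0) {
  int32_t partial = 0;
  for (int k = 0; k < 7; ++k) {
    const int32_t c = (k == 3) ? f[k] - (1 << FILTER_BITS) : f[k];
    partial += c * p[k];
  }
  const int32_t rounded = (partial + (1 << (round_0 - 1))) >> round_0;
  return rounded + (p[3] << (FILTER_BITS - round_0));
}

int main(void) {
  const int16_t c0 = 3, c1 = -7, c2 = 15;
  const int16_t f[7] = { c0, c1, c2, (int16_t)(128 - 2 * (c0 + c1 + c2)),
                         c2, c1, c0 };
  const uint8_t p[7] = { 10, 250, 3, 77, 200, 8, 130 };
  assert(wiener_horiz_direct(p, f, 3) == wiener_horiz_split(p, f, 3));
  return 0;
}
```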
diff --git a/av1/common/x86/wiener_convolve_sse2.c b/av1/common/x86/wiener_convolve_sse2.c
deleted file mode 100644
index bcdcef7..0000000
--- a/av1/common/x86/wiener_convolve_sse2.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>
-#include <assert.h>
-
-#include "config/av1_rtcd.h"
-
-#include "av1/common/convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h,
-                                      const ConvolveParams *conv_params) {
-  const int bd = 8;
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  DECLARE_ALIGNED(16, uint16_t,
-                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 2;
-  memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const = _mm_set1_epi32(
-        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        // Filter even-index pixels
-        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  conv_params->round_0);
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 conv_params->round_0);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(
-            _mm_max_epi16(res, zero),
-            _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1));
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
-                       (1 << (bd + conv_params->round_1 - 1)));
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), conv_params->round_1);
-
-        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storel_epi64(p, res_8bit);
-      }
-    }
-  }
-}
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 50cf689..5ddf573 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -108,27 +108,17 @@
 static AOM_INLINE void set_planes_to_neutral_grey(
     const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf,
     int only_chroma) {
-  if (seq_params->use_highbitdepth) {
-    const int val = 1 << (seq_params->bit_depth - 1);
-    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
-      const int is_uv = plane > 0;
-      uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
-      // Set the first row to neutral grey. Then copy the first row to all
-      // subsequent rows.
-      if (buf->crop_heights[is_uv] > 0) {
-        aom_memset16(base, val, buf->crop_widths[is_uv]);
-        for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
-          memcpy(&base[row_idx * buf->strides[is_uv]], base,
-                 sizeof(*base) * buf->crop_widths[is_uv]);
-        }
-      }
-    }
-  } else {
-    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
-      const int is_uv = plane > 0;
-      for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
-        memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7,
-               buf->crop_widths[is_uv]);
+  const int val = 1 << (seq_params->bit_depth - 1);
+  for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+    const int is_uv = plane > 0;
+    uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+    // Set the first row to neutral grey. Then copy the first row to all
+    // subsequent rows.
+    if (buf->crop_heights[is_uv] > 0) {
+      aom_memset16(base, val, buf->crop_widths[is_uv]);
+      for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+        memcpy(&base[row_idx * buf->strides[is_uv]], base,
+               sizeof(*base) * buf->crop_widths[is_uv]);
       }
     }
   }
@@ -286,8 +276,7 @@
   mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
                   pd->subsampling_x, pd->subsampling_y);
   mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
-                          plane, pixel_c, pixel_r, blk_w, blk_h,
-                          xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+                          plane, pixel_c, pixel_r, blk_w, blk_h);
 #endif
 }
 
@@ -463,42 +452,6 @@
   } while (--b_h);
 }
 
-static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride,
-                                       uint8_t *dst, int dst_stride, int x,
-                                       int y, int b_w, int b_h, int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w) left = b_w;
-
-    if (x + b_w > w) right = x + b_w - w;
-
-    if (right > b_w) right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left) memset(dst, ref_row[0], left);
-
-    if (copy) memcpy(dst + left, ref_row + x + left, copy);
-
-    if (right) memset(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h) ref_row += src_stride;
-  } while (--b_h);
-}
-
 static INLINE int update_extend_mc_border_params(
     const struct scale_factors *const sf, struct buf_2d *const pre_buf,
     MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv,
@@ -544,7 +497,7 @@
                                     struct buf_2d *const pre_buf,
                                     MV32 scaled_mv, PadBlock block,
                                     int subpel_x_mv, int subpel_y_mv,
-                                    int do_warp, int is_intrabc, int highbd,
+                                    int do_warp, int is_intrabc,
                                     uint8_t *mc_buf, uint8_t **pre,
                                     int *src_stride) {
   int x_pad = 0, y_pad = 0;
@@ -559,14 +512,8 @@
     const int b_h = block.y1 - block.y0;
 
     // Extend the border.
-    if (highbd) {
-      highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
-                             block.y0, b_w, b_h, pre_buf->width,
-                             pre_buf->height);
-    } else {
-      build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
-                      b_h, pre_buf->width, pre_buf->height);
-    }
+    highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0,
+                           b_w, b_h, pre_buf->width, pre_buf->height);
 
     *src_stride = b_w;
     *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
@@ -712,11 +659,11 @@
                          use_optflow_refinement,
 #endif  // CONFIG_OPTFLOW_REFINEMENT
                          &scaled_mv, &subpel_x_mv, &subpel_y_mv);
-  extend_mc_border(
-      inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
-      scaled_mv, block, subpel_x_mv, subpel_y_mv,
-      inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
-      inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
+  extend_mc_border(inter_pred_params->scale_factors,
+                   &inter_pred_params->ref_frame_buf, scaled_mv, block,
+                   subpel_x_mv, subpel_y_mv,
+                   inter_pred_params->mode == WARP_PRED,
+                   inter_pred_params->is_intrabc, mc_buf[ref], pre, src_stride);
 }
 
 #if CONFIG_TIP
@@ -825,11 +772,11 @@
   tip_dec_calc_subpel_params(src_mv, inter_pred_params, mi_x, mi_y, pre,
                              subpel_params, src_stride, &block, &scaled_mv,
                              &subpel_x_mv, &subpel_y_mv);
-  extend_mc_border(
-      inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf,
-      scaled_mv, block, subpel_x_mv, subpel_y_mv,
-      inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc,
-      inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride);
+  extend_mc_border(inter_pred_params->scale_factors,
+                   &inter_pred_params->ref_frame_buf, scaled_mv, block,
+                   subpel_x_mv, subpel_y_mv,
+                   inter_pred_params->mode == WARP_PRED,
+                   inter_pred_params->is_intrabc, mc_buf[ref], pre, src_stride);
 }
 
 static void av1_dec_setup_tip_frame(AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1185,8 +1132,7 @@
       continue;
     mismatch_check_block_pre(pd->dst.buf, pd->dst.stride,
                              cm->current_frame.order_hint, plane, pixel_c,
-                             pixel_r, pd->width, pd->height,
-                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+                             pixel_r, pd->width, pd->height);
   }
 #endif
 }
@@ -2723,9 +2669,9 @@
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment,
-          &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
+          seq_params->subsampling_y, AOM_DEC_BORDER_IN_PIXELS,
+          cm->features.byte_alignment, &cm->cur_frame->raw_frame_buffer,
+          pool->get_fb_cb, pool->cb_priv)) {
     unlock_buffer_pool(pool);
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
@@ -2751,9 +2697,8 @@
   YV12_BUFFER_CONFIG *tip_frame_buf = &cm->tip_ref.tip_frame->buf;
   if (aom_realloc_frame_buffer(
           tip_frame_buf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL,
-          NULL)) {
+          seq_params->subsampling_y, AOM_DEC_BORDER_IN_PIXELS,
+          cm->features.byte_alignment, NULL, NULL, NULL)) {
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
@@ -4210,14 +4155,10 @@
 void av1_free_mc_tmp_buf(ThreadData *thread_data) {
   int ref;
   for (ref = 0; ref < 2; ref++) {
-    if (thread_data->mc_buf_use_highbd)
-      aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
-    else
-      aom_free(thread_data->mc_buf[ref]);
+    aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
     thread_data->mc_buf[ref] = NULL;
   }
   thread_data->mc_buf_size = 0;
-  thread_data->mc_buf_use_highbd = 0;
 
   aom_free(thread_data->tmp_conv_dst);
   thread_data->tmp_conv_dst = NULL;
@@ -4229,26 +4170,19 @@
 
 static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
                                            ThreadData *thread_data,
-                                           int buf_size, int use_highbd) {
+                                           int buf_size) {
   for (int ref = 0; ref < 2; ref++) {
     // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
     // 'Conditional jump or move depends on uninitialised value' from the loop
     // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in
     // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the
     // potential reason for this issue.
-    if (use_highbd) {
-      uint16_t *hbd_mc_buf;
-      CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
-      memset(hbd_mc_buf, 0, buf_size);
-      thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
-    } else {
-      CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
-                      (uint8_t *)aom_memalign(16, buf_size));
-      memset(thread_data->mc_buf[ref], 0, buf_size);
-    }
+    uint16_t *hbd_mc_buf;
+    CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
+    memset(hbd_mc_buf, 0, buf_size);
+    thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
   }
   thread_data->mc_buf_size = buf_size;
-  thread_data->mc_buf_use_highbd = use_highbd;
 
   CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
                   aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
@@ -4367,13 +4301,12 @@
       thread_data->error_info.setjmp = 0;
     }
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth;
-  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+  const int buf_size = MC_TEMP_BUF_PELS << 1;
   for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
     if (thread_data->td->mc_buf_size != buf_size) {
       av1_free_mc_tmp_buf(thread_data->td);
-      allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+      allocate_mc_tmp_buf(cm, thread_data->td, buf_size);
     }
   }
 }
@@ -4899,12 +4832,10 @@
 }
 
 void av1_read_color_config(struct aom_read_bit_buffer *rb,
-                           int allow_lowbitdepth, SequenceHeader *seq_params,
+                           SequenceHeader *seq_params,
                            struct aom_internal_error_info *error_info) {
   read_bitdepth(rb, seq_params, error_info);
 
-  seq_params->use_highbitdepth =
-      seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
   // monochrome bit (not needed for PROFILE_1)
   const int is_monochrome =
       seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
@@ -5724,9 +5655,9 @@
           if (aom_realloc_frame_buffer(
                   &buf->buf, seq_params->max_frame_width,
                   seq_params->max_frame_height, seq_params->subsampling_x,
-                  seq_params->subsampling_y, seq_params->use_highbitdepth,
-                  AOM_BORDER_IN_PIXELS, features->byte_alignment,
-                  &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
+                  seq_params->subsampling_y, AOM_BORDER_IN_PIXELS,
+                  features->byte_alignment, &buf->raw_frame_buffer,
+                  pool->get_fb_cb, pool->cb_priv)) {
             decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
@@ -6444,11 +6375,10 @@
       cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
     av1_alloc_restoration_buffers(cm);
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth;
-  const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+  const int buf_size = MC_TEMP_BUF_PELS << 1;
   if (pbi->td.mc_buf_size != buf_size) {
     av1_free_mc_tmp_buf(&pbi->td);
-    allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
+    allocate_mc_tmp_buf(cm, &pbi->td, buf_size);
   }
 }
 
@@ -6531,24 +6461,14 @@
 #endif
         for (int r = 0; r < pic_height; ++r) {
           for (int c = 0; c < pic_width; ++c) {
-            if (cm->seq_params.use_highbitdepth) {
 #if CONFIG_CCSO_EXT
-              ext_rec_y[c] = CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)[c];
+            ext_rec_y[c] = CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)[c];
 #else
-                ext_rec_y[(r + CCSO_PADDING_SIZE) * ccso_stride_ext + c +
-                          CCSO_PADDING_SIZE] =
-                    CONVERT_TO_SHORTPTR(
-                        xd->plane[pli].dst.buf)[r * dst_stride + c];
+              ext_rec_y[(r + CCSO_PADDING_SIZE) * ccso_stride_ext + c +
+                        CCSO_PADDING_SIZE] =
+                  CONVERT_TO_SHORTPTR(
+                      xd->plane[pli].dst.buf)[r * dst_stride + c];
 #endif
-            } else {
-#if CONFIG_CCSO_EXT
-              ext_rec_y[c] = xd->plane[pli].dst.buf[c];
-#else
-                ext_rec_y[(r + CCSO_PADDING_SIZE) * ccso_stride_ext + c +
-                          CCSO_PADDING_SIZE] =
-                    xd->plane[pli].dst.buf[r * dst_stride + c];
-#endif
-            }
           }
 #if CONFIG_CCSO_EXT
           ext_rec_y += ccso_stride_ext;
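
Note (aside): several of the decodeframe.c hunks above — `allocate_mc_tmp_buf()`, `av1_free_mc_tmp_buf()`, `set_planes_to_neutral_grey()` — lean on libaom's tagged-pointer convention for 16-bit buffers: the real `uint16_t *` allocation is stored as a `uint8_t *` alias via `CONVERT_TO_BYTEPTR()` and recovered with `CONVERT_TO_SHORTPTR()`, which is why the free now always goes through that macro. A self-contained sketch of the convention (the macro bodies below mirror aom_dsp/aom_dsp_common.h as I read it; that header remains authoritative):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* A 16-bit buffer is carried around as a uint8_t* whose value is the real
 * address shifted right by one, so adding a pixel offset to the alias and
 * converting back lands on the matching uint16_t element. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

int main(void) {
  uint16_t *hbd_buf = malloc(64 * sizeof(*hbd_buf)); /* real 16-bit storage */
  uint8_t *mc_buf = CONVERT_TO_BYTEPTR(hbd_buf);     /* what ThreadData keeps */

  /* Pixel arithmetic on the alias stays in pixel units. */
  assert(CONVERT_TO_SHORTPTR(mc_buf + 5) == hbd_buf + 5);

  /* Freeing must go back through the real pointer, as in av1_free_mc_tmp_buf. */
  free(CONVERT_TO_SHORTPTR(mc_buf));
  return 0;
}
```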
diff --git a/av1/decoder/decodeframe.h b/av1/decoder/decodeframe.h
index d09c4e8..e1380b7 100644
--- a/av1/decoder/decodeframe.h
+++ b/av1/decoder/decodeframe.h
@@ -67,7 +67,7 @@
 // Implements the color_config() function in the spec. Reports errors by
 // calling rb->error_handler() or aom_internal_error().
 void av1_read_color_config(struct aom_read_bit_buffer *rb,
-                           int allow_lowbitdepth, SequenceHeader *seq_params,
+                           SequenceHeader *seq_params,
                            struct aom_internal_error_info *error_info);
 
 // Implements the timing_info() function in the spec. Reports errors by calling
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index e7798b2..12b8201 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -337,9 +337,7 @@
   return a->y_height == b->y_height && a->y_width == b->y_width &&
          a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
          a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
-         a->border == b->border &&
-         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
-             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+         a->border == b->border;
 }
 
 aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index d7ab3a7..78b6fc9 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -274,7 +274,6 @@
   // So we track whether this is the first frame or not.
   int decoding_first_frame;
 
-  int allow_lowbitdepth;
   int max_threads;
   int inv_tile_order;
   int need_resync;  // wait for key/intra-only frame.
diff --git a/av1/decoder/obu.c b/av1/decoder/obu.c
index 98d2a92..8670f8a 100644
--- a/av1/decoder/obu.c
+++ b/av1/decoder/obu.c
@@ -241,7 +241,7 @@
 
   av1_read_sequence_header(cm, rb, seq_params);
 
-  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+  av1_read_color_config(rb, seq_params, &cm->error);
   if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
       !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
       !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
@@ -385,42 +385,14 @@
          (pbi->output_frame_width_in_tiles_minus_1 + 1) *
              (pbi->output_frame_height_in_tiles_minus_1 + 1));
 
-  // Allocate the tile list output buffer.
-  // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth
-  // is 8, we could allocate less memory, namely, 8 bits/pixel.
   if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width,
                              output_frame_height, cm->seq_params.subsampling_x,
-                             cm->seq_params.subsampling_y,
-                             (cm->seq_params.use_highbitdepth &&
-                              (cm->seq_params.bit_depth > AOM_BITS_8)),
-                             0, cm->features.byte_alignment))
+                             cm->seq_params.subsampling_y, 0,
+                             cm->features.byte_alignment))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate the tile list output buffer");
 }
 
-static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1,
-                           int hend1, int vstart1, int vend1,
-                           YV12_BUFFER_CONFIG *dst, int hstart2, int vstart2,
-                           int plane) {
-  const int src_stride = (plane > 0) ? src->strides[1] : src->strides[0];
-  const int dst_stride = (plane > 0) ? dst->strides[1] : dst->strides[0];
-  int row, col;
-
-  assert(src->flags & YV12_FLAG_HIGHBITDEPTH);
-  assert(!(dst->flags & YV12_FLAG_HIGHBITDEPTH));
-
-  const uint16_t *src16 =
-      CONVERT_TO_SHORTPTR(src->buffers[plane] + vstart1 * src_stride + hstart1);
-  uint8_t *dst8 = dst->buffers[plane] + vstart2 * dst_stride + hstart2;
-
-  for (row = vstart1; row < vend1; ++row) {
-    for (col = 0; col < (hend1 - hstart1); ++col) *dst8++ = (uint8_t)(*src16++);
-    src16 += src_stride - (hend1 - hstart1);
-    dst8 += dst_stride - (hend1 - hstart1);
-  }
-  return;
-}
-
 static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
                                                   int tile_idx) {
   AV1_COMMON *const cm = &pbi->common;
@@ -453,26 +425,20 @@
     int vstart2 = tr * h;
     int hstart2 = tc * w;
 
-    if (cm->seq_params.use_highbitdepth &&
-        cm->seq_params.bit_depth == AOM_BITS_8) {
-      yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1,
-                     &pbi->tile_list_outbuf, hstart2, vstart2, plane);
-    } else {
-      switch (plane) {
-        case 0:
-          aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1,
-                                  &pbi->tile_list_outbuf, hstart2, vstart2);
-          break;
-        case 1:
-          aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1,
-                                  &pbi->tile_list_outbuf, hstart2, vstart2);
-          break;
-        case 2:
-          aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1,
-                                  &pbi->tile_list_outbuf, hstart2, vstart2);
-          break;
-        default: assert(0);
-      }
+    switch (plane) {
+      case 0:
+        aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1,
+                                &pbi->tile_list_outbuf, hstart2, vstart2);
+        break;
+      case 1:
+        aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1,
+                                &pbi->tile_list_outbuf, hstart2, vstart2);
+        break;
+      case 2:
+        aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1,
+                                &pbi->tile_list_outbuf, hstart2, vstart2);
+        break;
+      default: assert(0);
     }
   }
 }
diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c
index e02dded..5595f53 100644
--- a/av1/encoder/aq_variance.c
+++ b/av1/encoder/aq_variance.c
@@ -34,8 +34,6 @@
 #define ENERGY_IN_BOUNDS(energy) \
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
 
-DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
-
 DECLARE_ALIGNED(16, static const uint16_t,
                 av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
 
@@ -131,20 +129,11 @@
 
   for (i = 0; i < bh; i += 4) {
     for (j = 0; j < bw; j += 4) {
-      if (is_cur_buf_hbd(xd)) {
-        var +=
-            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
-                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
-                          x->plane[0].src.stride,
-                          CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
-                          16);
-      } else {
-        var +=
-            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
-                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
-                          x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
-                          16);
-      }
+      var += log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                           x->plane[0].src.stride,
+                           CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+                           16);
     }
   }
   // Use average of 4x4 log variance. The range for 8 bit 0 - 9.704121561.
@@ -158,17 +147,15 @@
 #define DEFAULT_E_MIDPOINT 10.0
 
 static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
-  MACROBLOCKD *xd = &x->e_mbd;
   int stride = x->plane[0].src.stride;
   uint8_t *buf = x->plane[0].src.buf;
   const int bw = MI_SIZE * mi_size_wide[bs];
   const int bh = MI_SIZE * mi_size_high[bs];
-  const int hbd = is_cur_buf_hbd(xd);
 
   int var = 0;
   for (int r = 0; r < bh; r += 8)
     for (int c = 0; c < bw; c += 8) {
-      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
+      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride);
     }
 
   return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
diff --git a/av1/encoder/arm/neon/av1_error_neon.c b/av1/encoder/arm/neon/av1_error_neon.c
deleted file mode 100644
index 22da1a8..0000000
--- a/av1/encoder/arm/neon/av1_error_neon.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "av1/common/arm/mem_neon.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
-                             intptr_t block_size, int64_t *ssz) {
-  int64x2_t error = vdupq_n_s64(0);
-  int64x2_t sqcoeff = vdupq_n_s64(0);
-
-  assert(block_size >= 8);
-  assert((block_size % 8) == 0);
-
-  do {
-    const int16x8_t c = load_tran_low_to_s16q(coeff);
-    const int16x8_t d = load_tran_low_to_s16q(dqcoeff);
-    const int16x8_t diff = vsubq_s16(c, d);
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
-    // accumulating them in 64-bits.
-    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
-    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
-    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
-    error = vaddq_s64(error, err2);
-
-    const int16x4_t coeff_lo = vget_low_s16(c);
-    const int16x4_t coeff_hi = vget_high_s16(c);
-    const int32x4_t sqcoeff0 = vmull_s16(coeff_lo, coeff_lo);
-    const int32x4_t sqcoeff1 = vmlal_s16(sqcoeff0, coeff_hi, coeff_hi);
-    const int64x2_t sqcoeff2 =
-        vaddl_s32(vget_low_s32(sqcoeff1), vget_high_s32(sqcoeff1));
-    sqcoeff = vaddq_s64(sqcoeff, sqcoeff2);
-
-    coeff += 8;
-    dqcoeff += 8;
-    block_size -= 8;
-  } while (block_size != 0);
-
-#if defined(__aarch64__)
-  *ssz = vaddvq_s64(sqcoeff);
-  return vaddvq_s64(error);
-#else
-  *ssz = vgetq_lane_s64(sqcoeff, 0) + vgetq_lane_s64(sqcoeff, 1);
-  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
-#endif
-}
-
-int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
-                                int block_size) {
-  int64x2_t error = vdupq_n_s64(0);
-
-  assert(block_size >= 8);
-  assert((block_size % 8) == 0);
-
-  do {
-    const int16x8_t c = vld1q_s16(coeff);
-    const int16x8_t d = vld1q_s16(dqcoeff);
-    const int16x8_t diff = vsubq_s16(c, d);
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
-    // accumulating them in 64-bits.
-    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
-    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
-    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
-    error = vaddq_s64(error, err2);
-    coeff += 8;
-    dqcoeff += 8;
-    block_size -= 8;
-  } while (block_size != 0);
-
-  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
-}
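
Note (aside): the precision argument in the deleted NEON comment — "diff is 15-bits, the squares 30, so we can store 2 in 31-bits before accumulating them in 64-bits" — is the worst-case bound that justified using `vmlal_s16` before widening to 64 bits. A quick numeric check of those limits (values chosen at the extremes, not taken from the codec):

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t max_abs_diff = (1 << 15) - 1;            /* |coeff - dqcoeff| */
  const int64_t square = (int64_t)max_abs_diff * max_abs_diff;
  assert(square < ((int64_t)1 << 30));      /* one square fits in 30 bits */
  assert(2 * square < ((int64_t)1 << 31));  /* two, as summed by vmlal_s16 */
  return 0;
}
```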
diff --git a/av1/encoder/arm/neon/picksrt_neon.c b/av1/encoder/arm/neon/picksrt_neon.c
deleted file mode 100644
index 7e461ac..0000000
--- a/av1/encoder/arm/neon/picksrt_neon.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <arm_neon.h>
-#include <math.h>
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/restoration.h"
-#include "common/tools_common.h"
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-int64_t av1_lowbd_pixel_proj_error_neon(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
-  int i, j, k;
-  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
-  const int32x4_t zero = vdupq_n_s32(0);
-  uint64x2_t sum64 = vreinterpretq_u64_s32(zero);
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-
-  int64_t err = 0;
-  if (params->r[0] > 0 && params->r[1] > 0) {
-    for (i = 0; i < height; ++i) {
-      int32x4_t err0 = zero;
-      for (j = 0; j <= width - 8; j += 8) {
-        const uint8x8_t d0 = vld1_u8(&dat[j]);
-        const uint8x8_t s0 = vld1_u8(&src[j]);
-        const int16x8_t flt0_16b =
-            vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])),
-                         vqmovn_s32(vld1q_s32(&flt0[j + 4])));
-        const int16x8_t flt1_16b =
-            vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])),
-                         vqmovn_s32(vld1q_s32(&flt1[j + 4])));
-        const int16x8_t u0 =
-            vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS));
-        const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0);
-        const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0);
-        const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u);
-        const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u);
-        const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u);
-        const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u);
-
-        int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]);
-        v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]);
-        int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]);
-        v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]);
-        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
-        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
-        const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1),
-                                       vreinterpretq_s16_u16(vsubl_u8(d0, s0)));
-        const int16x4_t e0_lo = vget_low_s16(e0);
-        const int16x4_t e0_hi = vget_high_s16(e0);
-        err0 = vmlal_s16(err0, e0_lo, e0_lo);
-        err0 = vmlal_s16(err0, e0_hi, e0_hi);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
-        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt0 += flt0_stride;
-      flt1 += flt1_stride;
-      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
-    }
-
-  } else if (params->r[0] > 0 || params->r[1] > 0) {
-    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
-    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
-    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
-    for (i = 0; i < height; ++i) {
-      int32x4_t err0 = zero;
-      for (j = 0; j <= width - 8; j += 8) {
-        const uint8x8_t d0 = vld1_u8(&dat[j]);
-        const uint8x8_t s0 = vld1_u8(&src[j]);
-        const uint16x8_t d0s0 = vsubl_u8(d0, s0);
-        const uint16x8x2_t d0w =
-            vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero));
-
-        const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]);
-        const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]);
-
-        int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active);
-        v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]),
-                         xq_active << SGRPROJ_RST_BITS);
-        int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active);
-        v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]),
-                         xq_active << SGRPROJ_RST_BITS);
-        const int16x4_t vr0 = vqrshrn_n_s32(v0, 11);
-        const int16x4_t vr1 = vqrshrn_n_s32(v1, 11);
-        const int16x8_t e0 =
-            vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0));
-        const int16x4_t e0_lo = vget_low_s16(e0);
-        const int16x4_t e0_hi = vget_high_s16(e0);
-        err0 = vmlal_s16(err0, e0_lo, e0_lo);
-        err0 = vmlal_s16(err0, e0_hi, e0_hi);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = dat[k] << SGRPROJ_RST_BITS;
-        int32_t v = xq_active * (flt[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt += flt_stride;
-      sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0));
-    }
-  } else {
-    uint32x4_t err0 = vreinterpretq_u32_s32(zero);
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j <= width - 16; j += 16) {
-        const uint8x16_t d = vld1q_u8(&dat[j]);
-        const uint8x16_t s = vld1q_u8(&src[j]);
-        const uint8x16_t diff = vabdq_u8(d, s);
-        const uint8x8_t diff0 = vget_low_u8(diff);
-        const uint8x8_t diff1 = vget_high_u8(diff);
-        err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0));
-        err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1));
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t e = dat[k] - src[k];
-        err += e * e;
-      }
-      dat += dat_stride;
-      src += src_stride;
-    }
-    sum64 = vpaddlq_u32(err0);
-  }
-#if defined(__aarch64__)
-  err += vaddvq_u64(sum64);
-#else
-  err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0);
-#endif  // __aarch64__
-  return err;
-}
diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c
index 3f0f5dc..4db7083 100644
--- a/av1/encoder/av1_noise_estimate.c
+++ b/av1/encoder/av1_noise_estimate.c
@@ -49,25 +49,8 @@
        (cpi->common.width != resize_pending_params->width ||
         cpi->common.height != resize_pending_params->height));
 
-  if (cpi->common.seq_params.use_highbitdepth) return 0;
-
-// Enable noise estimation if denoising is on.
-#if CONFIG_AV1_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && cpi->common.width >= 320 &&
-      cpi->common.height >= 180)
-    return 1;
-#endif
-  // Only allow noise estimate under certain encoding mode.
-  // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
-  // Not enabled for SVC mode and screen_content_mode.
-  // Not enabled for low resolutions.
-  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
-      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
-      resize_pending == 0 && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
-      cpi->common.width * cpi->common.height >= 640 * 360)
-    return 1;
-  else
-    return 0;
+  (void)resize_pending;
+  return 0;  // Noise estimation was only enabled on the low-bitdepth path, which has been removed.
 }
 
 #if CONFIG_AV1_TEMPORAL_DENOISING
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index b93929f..9b16a19 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -34,94 +34,6 @@
   *eob_ptr = 0;
 }
 
-static void quantize_fp_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int32_t *zbin_ptr,
-    const int32_t *round_ptr, const int32_t *quant_ptr,
-    const int32_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int32_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr, int log_scale) {
-  int i, eob = -1;
-  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
-                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
-
-  const int shift = 16 - log_scale + QUANT_FP_BITS;
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (qm_ptr == NULL && iqm_ptr == NULL) {
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int32_t thresh =
-          (int32_t)ROUND_POWER_OF_TWO(dequant_ptr[rc != 0], QUANT_TABLE_BITS);
-
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = AOMSIGN(coeff);
-      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int tmp32 = 0;
-      if ((abs_coeff << (1 + log_scale)) >= thresh) {
-        abs_coeff =
-            clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
-        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (shift));
-
-        if (tmp32) {
-          qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff =
-              (tran_low_t)ROUND_POWER_OF_TWO_64(
-                  (tran_high_t)tmp32 * dequant_ptr[rc != 0],
-                  QUANT_TABLE_BITS) >>
-              log_scale;
-
-          dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-        }
-      }
-      if (tmp32) eob = i;
-    }
-  } else {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-      const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-      const int dequant =
-          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-      const int coeff_sign = AOMSIGN(coeff);
-      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int tmp32 = 0;
-
-      if ((((tran_high_t)abs_coeff * wt) << QUANT_TABLE_BITS) >=
-          ((tran_high_t)dequant_ptr[rc != 0]
-           << (AOM_QM_BITS - (1 + log_scale)))) {
-        abs_coeff += rounding[rc != 0];
-        abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
-
-        tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
-                      (shift + AOM_QM_BITS));
-
-        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-
-        const tran_low_t abs_dqcoeff =
-            (tran_low_t)ROUND_POWER_OF_TWO_64((tran_high_t)tmp32 * dequant,
-                                              QUANT_TABLE_BITS) >>
-            log_scale;
-
-        dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-      }
-
-      if (tmp32) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
 static void highbd_quantize_fp_helper_c(
     const tran_low_t *coeff_ptr, intptr_t count, const int32_t *zbin_ptr,
     const int32_t *round_ptr, const int32_t *quant_ptr,
@@ -211,231 +123,6 @@
   *eob_ptr = eob + 1;
 }
 
-void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       const int32_t *zbin_ptr, const int32_t *round_ptr,
-                       const int32_t *quant_ptr, const int32_t *quant_shift_ptr,
-                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                       const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                       const int16_t *scan, const int16_t *iscan) {
-  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                       eob_ptr, scan, iscan, NULL, NULL, 0);
-}
-
-void av1_quantize_lp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       const int32_t *round_ptr, const int32_t *quant_ptr,
-                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                       const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                       const int16_t *scan) {
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Quantization pass: All coefficients with index >= zero_flag are
-  // skippable. Note: zero_flag can be zero.
-  for (int i = 0; i < n_coeffs; i++) {
-    const int rc = scan[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = AOMSIGN(coeff);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-    if (tmp) eob = i;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             const int32_t *zbin_ptr, const int32_t *round_ptr,
-                             const int32_t *quant_ptr,
-                             const int32_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
-  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                       eob_ptr, scan, iscan, NULL, NULL, 1);
-}
-
-void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             const int32_t *zbin_ptr, const int32_t *round_ptr,
-                             const int32_t *quant_ptr,
-                             const int32_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int32_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
-  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                       eob_ptr, scan, iscan, NULL, NULL, 2);
-}
-
-void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
-  const qm_val_t *qm_ptr = qparam->qmatrix;
-  const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  if (qm_ptr != NULL && iqm_ptr != NULL) {
-    quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
-                         p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                         sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
-  } else {
-    switch (qparam->log_scale) {
-      case 0:
-        av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
-                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                        sc->iscan);
-        break;
-      case 1:
-        av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
-                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                              sc->iscan);
-        break;
-      case 2:
-        av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
-                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                              sc->iscan);
-        break;
-      default: assert(0);
-    }
-  }
-}
-
-void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                           const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                           tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                           const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
-  const qm_val_t *qm_ptr = qparam->qmatrix;
-  const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  if (qparam->use_quant_b_adapt) {
-    // TODO(sarahparker) These quantize_b optimizations need SIMD
-    // implementations
-    if (qm_ptr != NULL && iqm_ptr != NULL) {
-      aom_quantize_b_adaptive_helper_c(
-          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
-          sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
-    } else {
-      switch (qparam->log_scale) {
-        case 0:
-          aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
-                                  p->round_QTX, p->quant_QTX,
-                                  p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
-                                  p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
-          break;
-        case 1:
-          aom_quantize_b_32x32_adaptive(
-              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-              eob_ptr, sc->scan, sc->iscan);
-          break;
-        case 2:
-          aom_quantize_b_64x64_adaptive(
-              coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
-              p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
-              eob_ptr, sc->scan, sc->iscan);
-          break;
-        default: assert(0);
-      }
-    }
-  } else {
-    if (qm_ptr != NULL && iqm_ptr != NULL) {
-      aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                              p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                              sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
-    } else {
-      switch (qparam->log_scale) {
-        case 0:
-          aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                         p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                         sc->iscan);
-          break;
-        case 1:
-          aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                               sc->iscan);
-          break;
-        case 2:
-          aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
-                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
-                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
-                               sc->iscan);
-          break;
-        default: assert(0);
-      }
-    }
-  }
-}
-
-static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                        int skip_block, const int32_t *round_ptr,
-                        const int32_t quant, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, const int32_t dequant_ptr,
-                        uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-                        const qm_val_t *iqm_ptr, const int log_scale) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = AOMSIGN(coeff);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int64_t tmp;
-  int eob = -1;
-  int32_t tmp32;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
-                INT16_MIN, INT16_MAX);
-
-    const int shift = 16 - log_scale + QUANT_FP_BITS;
-    tmp32 = (int32_t)((tmp * wt * quant) >> (shift + AOM_QM_BITS));
-
-    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-    dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-
-    const tran_low_t abs_dqcoeff =
-        (tran_low_t)ROUND_POWER_OF_TWO_64((tran_high_t)tmp32 * dequant,
-                                          QUANT_TABLE_BITS) >>
-        log_scale;
-
-    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-    if (tmp32) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
-  // obsolete skip_block
-  const int skip_block = 0;
-  (void)sc;
-  assert(qparam->log_scale >= 0 && qparam->log_scale < (3));
-  const qm_val_t *qm_ptr = qparam->qmatrix;
-  const qm_val_t *iqm_ptr = qparam->iqmatrix;
-  quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
-              p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
-              eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
-}
-
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h
index d423af2..8aff5fe 100644
--- a/av1/encoder/av1_quantize.h
+++ b/av1/encoder/av1_quantize.h
@@ -129,21 +129,6 @@
 void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
 
-void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
-
-void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                           const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                           tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                           const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
-
-void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
-                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
-
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
                                    tran_low_t *qcoeff_ptr,
diff --git a/av1/encoder/cnn.c b/av1/encoder/cnn.c
index ab2b217..702566f 100644
--- a/av1/encoder/cnn.c
+++ b/av1/encoder/cnn.c
@@ -998,60 +998,6 @@
 
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
-void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
-                                   int stride, const CNN_CONFIG *cnn_config,
-                                   const CNN_THREAD_DATA *thread_data,
-                                   CNN_MULTI_OUT *output) {
-  const float max_val = 255.0;
-
-  const int in_width = width + 2 * cnn_config->ext_width;
-  const int in_height = height + 2 * cnn_config->ext_height;
-  const int in_channels = cnn_config->layer_config[0].in_channels;
-  float *inputs[CNN_MAX_CHANNELS];
-  float *input_ =
-      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
-  const int in_stride = in_width;
-
-  for (int c = 0; c < in_channels; ++c) {
-    inputs[c] = input_ + c * in_stride * in_height;
-    float *input =
-        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
-
-    if (cnn_config->strict_bounds) {
-      for (int i = 0; i < height; ++i)
-        for (int j = 0; j < width; ++j)
-          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
-      // extend left and right
-      for (int i = 0; i < height; ++i) {
-        for (int j = -cnn_config->ext_width; j < 0; ++j)
-          input[i * in_stride + j] = input[i * in_stride];
-        for (int j = width; j < width + cnn_config->ext_width; ++j)
-          input[i * in_stride + j] = input[i * in_stride + width - 1];
-      }
-      // extend top and bottom
-      for (int i = -cnn_config->ext_height; i < 0; ++i)
-        memcpy(&input[i * in_stride - cnn_config->ext_width],
-               &input[-cnn_config->ext_width], in_width * sizeof(*input));
-      for (int i = height; i < height + cnn_config->ext_height; ++i)
-        memcpy(&input[i * in_stride - cnn_config->ext_width],
-               &input[(height - 1) * in_stride - cnn_config->ext_width],
-               in_width * sizeof(*input));
-    } else {
-      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
-           ++i)
-        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
-             ++j)
-          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
-    }
-  }
-  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
-                  cnn_config, thread_data, output);
-
-  aom_free(input_);
-}
-
-// Assume output already has proper allocation
-// Assume input image buffers all have same resolution and strides
 void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                           int stride,
                                           const CNN_CONFIG *cnn_config,
@@ -1109,24 +1055,6 @@
 
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
-void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
-                         const CNN_CONFIG *cnn_config,
-                         const CNN_THREAD_DATA *thread_data, float **output,
-                         int out_stride) {
-  int out_width = 0, out_height = 0, out_channels = 0;
-  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
-                           &out_channels);
-  const int output_chs[1] = { out_channels };
-  const int output_strides[1] = { out_stride };
-  CNN_MULTI_OUT output_struct = { .output_channels = output_chs,
-                                  .output_strides = output_strides,
-                                  .output_buffer = output };
-  av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
-                                thread_data, &output_struct);
-}
-
-// Assume output already has proper allocation
-// Assume input image buffers all have same resolution and strides
 void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                 int stride, const CNN_CONFIG *cnn_config,
                                 const CNN_THREAD_DATA *thread_data,
diff --git a/av1/encoder/cnn.h b/av1/encoder/cnn.h
index 8e2dac7..d8a2645 100644
--- a/av1/encoder/cnn.h
+++ b/av1/encoder/cnn.h
@@ -170,10 +170,6 @@
 
 // Prediction functions from set of input image buffers. This function supports
 // CNN with multiple outputs.
-void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
-                                   int stride, const CNN_CONFIG *cnn_config,
-                                   const CNN_THREAD_DATA *thread_data,
-                                   struct CNN_MULTI_OUT *output);
 void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                           int stride,
                                           const CNN_CONFIG *cnn_config,
@@ -182,10 +178,6 @@
 
 // Prediction functions from set of input image buffers. This function only
 // supports a single output.
-void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
-                         const CNN_CONFIG *cnn_config,
-                         const CNN_THREAD_DATA *thread_data, float **output,
-                         int out_stride);
 void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                 int stride, const CNN_CONFIG *cnn_config,
                                 const CNN_THREAD_DATA *thread_data,
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 38b9843..2e3e31c 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -154,10 +154,8 @@
   const BLOCK_SIZE f_index = split_qtr[bsize];
   assert(f_index != BLOCK_INVALID);
 
-  if (is_cur_buf_hbd(&x->e_mbd)) {
-    pred0 = CONVERT_TO_BYTEPTR(pred0);
-    pred1 = CONVERT_TO_BYTEPTR(pred1);
-  }
+  pred0 = CONVERT_TO_BYTEPTR(pred0);
+  pred1 = CONVERT_TO_BYTEPTR(pred1);
 
   // Residual variance computation over relevant quandrants in order to
   // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
@@ -202,17 +200,12 @@
   const int8_t wedge_types = get_wedge_types_lookup(bsize);
   const uint8_t *mask;
   uint64_t sse;
-  const int hbd = is_cur_buf_hbd(xd);
-  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+  const int bd_round = (xd->bd - 8) * 2;
 
   DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
 
-  if (hbd) {
-    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
-  }
+  aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+                            CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
 
   int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
                         (int64_t)aom_sum_squares_i16(residual1, N)) *
@@ -274,8 +267,7 @@
   const int8_t wedge_types = get_wedge_types_lookup(bsize);
   const uint8_t *mask;
   uint64_t sse;
-  const int hbd = is_cur_buf_hbd(xd);
-  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+  const int bd_round = (xd->bd - 8) * 2;
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
@@ -344,20 +336,15 @@
   DIFFWTD_MASK_TYPE cur_mask_type;
   int64_t best_rd = INT64_MAX;
   DIFFWTD_MASK_TYPE best_mask_type = 0;
-  const int hbd = is_cur_buf_hbd(xd);
-  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+  const int bd_round = (xd->bd - 8) * 2;
   DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
   uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
   // try each mask type and its inverse
   for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
     // build mask and inverse
-    if (hbd)
-      av1_build_compound_diffwtd_mask_highbd(
-          tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
-          CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
-    else
-      av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
-                                      p0, bw, p1, bw, bh, bw);
+    av1_build_compound_diffwtd_mask_highbd(
+        tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+        CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
 
     // compute rd for mask
     uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
@@ -397,15 +384,10 @@
   DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
   DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
 
-  if (is_cur_buf_hbd(xd)) {
-    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
-                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
-    aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
-  }
+  aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                            CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+  aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+                            CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
 
   int8_t wedge_index = -1;
   uint64_t sse;
@@ -427,15 +409,10 @@
   av1_build_inter_predictor_single_buf_y(xd, bsize, 1, pred1, stride);
   const struct buf_2d *const src = &x->plane[0].src;
 
-  if (is_cur_buf_hbd(xd)) {
-    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
-                              CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
-    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1), bw,
-                              CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
-  } else {
-    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, pred1, bw);
-    aom_subtract_block(bh, bw, diff10, bw, pred1, bw, pred0, bw);
-  }
+  aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                            CONVERT_TO_BYTEPTR(pred1), bw, xd->bd);
+  aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(pred1), bw,
+                            CONVERT_TO_BYTEPTR(pred0), bw, xd->bd);
 }
 
 // Computes the rd cost for the given interintra mode and updates the best
@@ -511,7 +488,7 @@
   const int bw = block_size_wide[bsize];
   int64_t best_interintra_rd_wedge = INT64_MAX;
   int64_t best_total_rd = INT64_MAX;
-  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+  uint8_t *intrapred = CONVERT_TO_BYTEPTR(intrapred_);
   for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) {
     mbmi->interintra_mode = mode;
     av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
@@ -743,8 +720,8 @@
   const int bw = block_size_wide[bsize];
   DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
   DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
-  uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
-  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+  uint8_t *tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
+  uint8_t *intrapred = CONVERT_TO_BYTEPTR(intrapred_);
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
 
@@ -1058,11 +1035,8 @@
   if (cpi->sf.inter_sf.prune_wedge_pred_diff_based &&
       compound_type == COMPOUND_WEDGE) {
     unsigned int sse;
-    if (is_cur_buf_hbd(xd))
-      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(pred0), stride,
-                                  CONVERT_TO_BYTEPTR(pred1), stride, &sse);
-    else
-      (void)cpi->fn_ptr[bsize].vf(pred0, stride, pred1, stride, &sse);
+    (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(pred0), stride,
+                                CONVERT_TO_BYTEPTR(pred1), stride, &sse);
     const unsigned int mse =
         ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
     // If two predictors are very similar, skip wedge compound mode search
diff --git a/av1/encoder/dwt.c b/av1/encoder/dwt.c
index f11bf84..280a18e 100644
--- a/av1/encoder/dwt.c
+++ b/av1/encoder/dwt.c
@@ -73,22 +73,14 @@
 static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
                                           uint8_t *x, int pitch_x,
                                           tran_low_t *c, int pitch_c,
-                                          int dwt_scale_bits, int hbd) {
+                                          int dwt_scale_bits) {
   int lv, i, j, nh, nw, hh = height, hw = width;
   tran_low_t buffer[2 * DWT_MAX_LENGTH];
 
-  if (hbd) {
-    uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j++) {
-        c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
-      }
-    }
-  } else {
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j++) {
-        c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
-      }
+  uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
     }
   }
 
@@ -110,9 +102,8 @@
   }
 }
 
-void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
-                               int hbd) {
-  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride) {
+  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2);
 }
 
 int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
@@ -148,9 +139,9 @@
   return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
 }
 
-int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride) {
   tran_low_t output[64];
 
-  av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+  av1_fdwt8x8_uint8_input_c(input, output, stride);
   return av1_haar_ac_sad(output, 8, 8, 8);
 }
diff --git a/av1/encoder/dwt.h b/av1/encoder/dwt.h
index e095ad7..67433b8 100644
--- a/av1/encoder/dwt.h
+++ b/av1/encoder/dwt.h
@@ -19,8 +19,7 @@
 #define DWT_MAX_LENGTH 64
 
 void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
-void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
-                               int hbd);
-int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride);
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride);
 
 #endif  // AOM_AV1_ENCODER_DWT_H_
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 161356c..8b7daa3 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -149,15 +149,6 @@
 };
 /*!\endcond */
 
-unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
-                                           const struct buf_2d *ref,
-                                           BLOCK_SIZE bs) {
-  unsigned int sse;
-  const unsigned int var =
-      cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
-}
-
 unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd) {
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 4bfcccf..003be64 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -41,13 +41,8 @@
                         const uint8_t *src8, ptrdiff_t src_stride,
                         const uint8_t *pred8, ptrdiff_t pred_stride) {
   assert(rows >= 4 && cols >= 4);
-  if (is_cur_buf_hbd(xd)) {
-    aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
-                              pred8, pred_stride, xd->bd);
-    return;
-  }
-  aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
-                     pred_stride);
+  aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+                            pred8, pred_stride, xd->bd);
 }
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
@@ -440,7 +435,6 @@
                               cm->features.reduced_tx_set_used);
 
   txfm_param->bd = xd->bd;
-  txfm_param->is_hbd = is_cur_buf_hbd(xd);
 }
 void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
                      int use_quant_b_adapt, QUANT_PARAM *qparam) {
@@ -618,8 +612,7 @@
     mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
                     blk_row, pd->subsampling_x, pd->subsampling_y);
     mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
-                             plane, pixel_c, pixel_r, blk_w, blk_h,
-                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+                             plane, pixel_c, pixel_r, blk_w, blk_h);
   }
 #endif
 }
@@ -781,11 +774,7 @@
 
   if (p->eobs[block] > 0) {
     txfm_param.eob = p->eobs[block];
-    if (txfm_param.is_hbd) {
-      av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
-      return;
-    }
-    av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+    av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
   }
 }
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 517265f..9bca2cd 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -450,7 +450,7 @@
   seq->enable_opfl_refine = tool_cfg->enable_opfl_refine;
 #endif  // CONFIG_OPTFLOW_REFINEMENT
 #if CONFIG_TIP
-  seq->enable_tip = seq->use_highbitdepth ? tool_cfg->enable_tip : 0;
+  seq->enable_tip = tool_cfg->enable_tip;
 
   if (oxcf->superres_cfg.superres_mode != AOM_SUPERRES_NONE) {
     seq->enable_tip = 0;
@@ -540,7 +540,6 @@
 
   seq_params->profile = oxcf->profile;
   seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
-  seq_params->use_highbitdepth = oxcf->use_highbitdepth;
   seq_params->color_primaries = color_cfg->color_primaries;
   seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
   seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
@@ -735,9 +734,6 @@
     seq_params->enable_tip = 0;
   }
 
-  if (!seq_params->use_highbitdepth) {
-    seq_params->enable_tip = 0;
-  }
 #endif  // CONFIG_TIP
   x->e_mbd.bd = (int)seq_params->bit_depth;
   x->e_mbd.global_motion = cm->global_motion;
@@ -1192,232 +1188,6 @@
   av1_zero(cpi->partition_stats);
 #endif
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                    \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                  \
-  cpi->fn_ptr[BT].vf = VF;                                      \
-  cpi->fn_ptr[BT].svf = SVF;                                    \
-  cpi->fn_ptr[BT].svaf = SVAF;                                  \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                              \
-  cpi->fn_ptr[BT].jsdaf = JSDAF;                                \
-  cpi->fn_ptr[BT].jsvaf = JSVAF;
-
-  BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
-      aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
-      aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
-      aom_dist_wtd_sub_pixel_avg_variance4x16)
-
-  BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
-      aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
-      aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
-      aom_dist_wtd_sub_pixel_avg_variance16x4)
-
-  BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
-      aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
-      aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
-      aom_dist_wtd_sub_pixel_avg_variance8x32)
-
-  BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
-      aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
-      aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
-      aom_dist_wtd_sub_pixel_avg_variance32x8)
-
-  BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
-      aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
-      aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
-      aom_dist_wtd_sub_pixel_avg_variance16x64)
-
-  BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
-      aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
-      aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
-      aom_dist_wtd_sub_pixel_avg_variance64x16)
-
-  BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
-      aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
-      aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
-      aom_dist_wtd_sub_pixel_avg_variance128x128)
-
-  BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
-      aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
-      aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
-      aom_dist_wtd_sub_pixel_avg_variance128x64)
-
-  BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
-      aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
-      aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
-      aom_dist_wtd_sub_pixel_avg_variance64x128)
-
-  BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
-      aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
-      aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
-      aom_dist_wtd_sub_pixel_avg_variance32x16)
-
-  BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
-      aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
-      aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
-      aom_dist_wtd_sub_pixel_avg_variance16x32)
-
-  BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
-      aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
-      aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
-      aom_dist_wtd_sub_pixel_avg_variance64x32)
-
-  BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
-      aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
-      aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
-      aom_dist_wtd_sub_pixel_avg_variance32x64)
-
-  BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
-      aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
-      aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
-      aom_dist_wtd_sub_pixel_avg_variance32x32)
-
-  BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
-      aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
-      aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
-      aom_dist_wtd_sub_pixel_avg_variance64x64)
-
-  BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
-      aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
-      aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
-      aom_dist_wtd_sub_pixel_avg_variance16x16)
-
-  BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
-      aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
-      aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
-      aom_dist_wtd_sub_pixel_avg_variance16x8)
-
-  BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
-      aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
-      aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
-      aom_dist_wtd_sub_pixel_avg_variance8x16)
-
-  BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
-      aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
-      aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
-
-  BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
-      aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
-      aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
-
-  BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
-      aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
-      aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
-
-  BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
-      aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
-      aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
-
-#define OBFP(BT, OSDF, OVF, OSVF) \
-  cpi->fn_ptr[BT].osdf = OSDF;    \
-  cpi->fn_ptr[BT].ovf = OVF;      \
-  cpi->fn_ptr[BT].osvf = OSVF;
-
-  OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
-       aom_obmc_sub_pixel_variance128x128)
-  OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
-       aom_obmc_sub_pixel_variance128x64)
-  OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
-       aom_obmc_sub_pixel_variance64x128)
-  OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
-       aom_obmc_sub_pixel_variance64x64)
-  OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
-       aom_obmc_sub_pixel_variance64x32)
-  OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
-       aom_obmc_sub_pixel_variance32x64)
-  OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
-       aom_obmc_sub_pixel_variance32x32)
-  OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
-       aom_obmc_sub_pixel_variance32x16)
-  OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
-       aom_obmc_sub_pixel_variance16x32)
-  OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
-       aom_obmc_sub_pixel_variance16x16)
-  OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
-       aom_obmc_sub_pixel_variance16x8)
-  OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
-       aom_obmc_sub_pixel_variance8x16)
-  OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
-       aom_obmc_sub_pixel_variance8x8)
-  OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
-       aom_obmc_sub_pixel_variance4x8)
-  OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
-       aom_obmc_sub_pixel_variance8x4)
-  OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
-       aom_obmc_sub_pixel_variance4x4)
-  OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
-       aom_obmc_sub_pixel_variance4x16)
-  OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
-       aom_obmc_sub_pixel_variance16x4)
-  OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
-       aom_obmc_sub_pixel_variance8x32)
-  OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
-       aom_obmc_sub_pixel_variance32x8)
-  OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
-       aom_obmc_sub_pixel_variance16x64)
-  OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
-       aom_obmc_sub_pixel_variance64x16)
-
-#define MBFP(BT, MCSDF, MCSVF)  \
-  cpi->fn_ptr[BT].msdf = MCSDF; \
-  cpi->fn_ptr[BT].msvf = MCSVF;
-
-  MBFP(BLOCK_128X128, aom_masked_sad128x128,
-       aom_masked_sub_pixel_variance128x128)
-  MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
-  MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
-  MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
-  MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
-  MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
-  MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
-  MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
-  MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
-  MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
-  MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
-  MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
-  MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
-  MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
-  MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
-  MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
-
-  MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
-
-  MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
-
-  MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
-
-  MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
-
-  MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
-
-  MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
-
-#define SDSFP(BT, SDSF, SDSX4DF) \
-  cpi->fn_ptr[BT].sdsf = SDSF;   \
-  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
-
-  SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d);
-  SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d);
-  SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d);
-  SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d);
-  SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d);
-  SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d);
-  SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d);
-  SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d);
-  SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d);
-  SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d);
-
-  SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d);
-  SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d);
-  SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d);
-  SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d);
-  SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d);
-  SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d);
-  SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d);
-  SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d);
-  SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d);
-#undef SDSFP
-
   highbd_set_var_fns(cpi);
 
   /* av1_init_quantizer() is first called here. Add check in
@@ -1720,56 +1490,30 @@
   uint8_t *src = s->y_buffer;
   int h = cm->height;
   if (yuv_rec_file == NULL) return;
-  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
-    uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
-
-    do {
-      fwrite(src16, s->y_width, 2, yuv_rec_file);
-      src16 += s->y_stride;
-    } while (--h);
-
-    src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
-    h = s->uv_height;
-
-    do {
-      fwrite(src16, s->uv_width, 2, yuv_rec_file);
-      src16 += s->uv_stride;
-    } while (--h);
-
-    src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
-    h = s->uv_height;
-
-    do {
-      fwrite(src16, s->uv_width, 2, yuv_rec_file);
-      src16 += s->uv_stride;
-    } while (--h);
-
-    fflush(yuv_rec_file);
-    return;
-  }
+  uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
 
   do {
-    fwrite(src, s->y_width, 1, yuv_rec_file);
-    src += s->y_stride;
+    fwrite(src16, s->y_width, 2, yuv_rec_file);
+    src16 += s->y_stride;
   } while (--h);
 
-  src = s->u_buffer;
+  src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
   h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width, 1, yuv_rec_file);
-    src += s->uv_stride;
+    fwrite(src16, s->uv_width, 2, yuv_rec_file);
+    src16 += s->uv_stride;
   } while (--h);
 
-  src = s->v_buffer;
+  src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
   h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width, 1, yuv_rec_file);
-    src += s->uv_stride;
+    fwrite(src16, s->uv_width, 2, yuv_rec_file);
+    src16 += s->uv_stride;
   } while (--h);
 
   fflush(yuv_rec_file);
 }
 #endif  // OUTPUT_YUV_REC
 
@@ -1854,7 +1599,6 @@
   // blocks that have few luma colors.
   const uint8_t *src = cpi->unfiltered_source->y_buffer;
   assert(src != NULL);
-  const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
   const int stride = cpi->unfiltered_source->y_stride;
   const int width = cpi->unfiltered_source->y_width;
   const int height = cpi->unfiltered_source->y_height;
@@ -1875,20 +1619,15 @@
       int count_buf[1 << 8];  // Maximum (1 << 8) bins for hbd path.
       const uint8_t *const this_src = src + r * stride + c;
       int n_colors;
-      if (use_hbd)
-        av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
-                                count_buf, &n_colors, NULL);
-      else
-        av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors);
+      av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
+                              count_buf, &n_colors, NULL);
       if (n_colors > 1 && n_colors <= color_thresh) {
         ++counts_1;
         struct buf_2d buf;
         buf.stride = stride;
         buf.buf = (uint8_t *)this_src;
         const unsigned int var =
-            use_hbd
-                ? av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd)
-                : av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16);
+            av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd);
         if (var > var_thresh) ++counts_2;
       }
     }
@@ -2010,19 +1749,17 @@
   }
 }
 
-void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
-                             int subsampling_x, int subsampling_y) {
+void av1_check_initial_width(AV1_COMP *cpi, int subsampling_x,
+                             int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
   SequenceHeader *const seq_params = &cm->seq_params;
   InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
 
   if (!initial_dimensions->width ||
-      seq_params->use_highbitdepth != use_highbitdepth ||
       seq_params->subsampling_x != subsampling_x ||
       seq_params->subsampling_y != subsampling_y) {
     seq_params->subsampling_x = subsampling_x;
     seq_params->subsampling_y = subsampling_y;
-    seq_params->use_highbitdepth = use_highbitdepth;
 
     av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
     av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
@@ -2045,8 +1782,7 @@
 int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
   InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
-  av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth,
-                          cm->seq_params.subsampling_x,
+  av1_check_initial_width(cpi, cm->seq_params.subsampling_x,
                           cm->seq_params.subsampling_y);
 
   if (width <= 0 || height <= 0) return 1;
@@ -2111,9 +1847,8 @@
   // Reset the frame pointers to the current frame size.
   if (aom_realloc_frame_buffer(
           &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
-          NULL))
+          seq_params->subsampling_y, cpi->oxcf.border_in_pixels,
+          cm->features.byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
@@ -2203,18 +1938,10 @@
       const int dst_stride = xd->plane[pli].dst.stride;
       for (int r = 0; r < pic_height; ++r) {
         for (int c = 0; c < pic_width; ++c) {
-          if (cm->seq_params.use_highbitdepth) {
-            if (pli == 0)
-              ext_rec_y[(r + CCSO_PADDING_SIZE) * ccso_stride_ext + c +
-                        CCSO_PADDING_SIZE] =
-                  CONVERT_TO_SHORTPTR(
-                      xd->plane[pli].dst.buf)[r * dst_stride + c];
-          } else {
-            if (pli == 0)
-              ext_rec_y[(r + CCSO_PADDING_SIZE) * ccso_stride_ext + c +
-                        CCSO_PADDING_SIZE] =
-                  xd->plane[pli].dst.buf[r * dst_stride + c];
-          }
+          if (pli == 0)
+            ext_rec_y[(r + CCSO_PADDING_SIZE) * ccso_stride_ext + c +
+                      CCSO_PADDING_SIZE] =
+                CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)[r * dst_stride + c];
         }
       }
     }
@@ -2277,16 +2004,10 @@
       }
       for (int r = 0; r < pic_height; ++r) {
         for (int c = 0; c < pic_width; ++c) {
-          if (cm->seq_params.use_highbitdepth) {
-            rec_uv[pli][r * ccso_stride + c] =
-                CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)[r * dst_stride + c];
-            org_uv[pli][r * ccso_stride + c] =
-                CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
-          } else {
-            rec_uv[pli][r * ccso_stride + c] =
-                xd->plane[pli].dst.buf[r * dst_stride + c];
-            org_uv[pli][r * ccso_stride + c] = ref_buffer[r * ref_stride + c];
-          }
+          rec_uv[pli][r * ccso_stride + c] =
+              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)[r * dst_stride + c];
+          org_uv[pli][r * ccso_stride + c] =
+              CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
         }
       }
     }
@@ -2829,9 +2550,7 @@
 
     // Compute sse and rate.
     YV12_BUFFER_CONFIG *tip_frame_buf = &cm->tip_ref.tip_frame->buf;
-    *sse = (cm->seq_params.use_highbitdepth)
-               ? aom_highbd_get_y_sse(cpi->source, tip_frame_buf)
-               : aom_get_y_sse(cpi->source, tip_frame_buf);
+    *sse = aom_highbd_get_y_sse(cpi->source, tip_frame_buf);
 
     const int64_t bits = (*size << 3);
     *rate = (bits << 5);  // To match scale.
@@ -2854,10 +2573,7 @@
     tip_as_ref_sse = *sse;
     tip_as_ref_rate = *rate;
   } else {
-    tip_as_ref_sse =
-        (cm->seq_params.use_highbitdepth)
-            ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
-            : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    tip_as_ref_sse = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
 
     const int64_t bits = (*size << 3);
     tip_as_ref_rate = (bits << 5);  // To match scale.
@@ -2970,11 +2686,7 @@
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
-    if (seq_params->use_highbitdepth) {
-      cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    } else {
-      cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    }
+    cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
   }
 
   cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
@@ -3032,9 +2744,7 @@
 
   // Compute sse and rate.
   if (sse != NULL) {
-    *sse = (seq_params->use_highbitdepth)
-               ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
-               : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+    *sse = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
   }
   if (rate != NULL) {
     const int64_t bits = (*size << 3);
@@ -3403,13 +3113,8 @@
     if (cm->current_frame_id == -1) {
       int lsb, msb;
       /* quasi-random initialization of current_frame_id for a key frame */
-      if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
-        lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
-        msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
-      } else {
-        lsb = cpi->source->y_buffer[0] & 0xff;
-        msb = cpi->source->y_buffer[1] & 0xff;
-      }
+      lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+      msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
       cm->current_frame_id =
           ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
 
@@ -3660,7 +3365,6 @@
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
-  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
 
 #if CONFIG_TUNE_VMAF
   if (!is_stat_generation_stage(cpi) &&
@@ -3684,8 +3388,7 @@
       res = -1;
 #endif  //  CONFIG_DENOISE
 
-  if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
-                         use_highbitdepth, frame_flags))
+  if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags))
     res = -1;
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&timer);
@@ -3738,7 +3441,6 @@
 
 static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
   AV1_COMMON *const cm = &cpi->common;
-  double samples = 0.0;
   const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
   const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
 
@@ -3761,13 +3463,8 @@
                         &cpi->psnr);
       cpi->total_sq_error += psnr.sse[0];
       cpi->total_samples += psnr.samples[0];
-      samples = psnr.samples[0];
-      // TODO(yaowu): unify these two versions into one.
-      if (cm->seq_params.use_highbitdepth)
-        frame_ssim2 =
-            aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
-      else
-        frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+      frame_ssim2 =
+          aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
 
       cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
       cpi->summed_quality += frame_ssim2 * weight;
@@ -3787,31 +3484,6 @@
       }
 #endif
     }
-    if (cpi->b_calculate_blockiness) {
-      if (!cm->seq_params.use_highbitdepth) {
-        const double frame_blockiness =
-            av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
-                               recon->y_stride, orig->y_width, orig->y_height);
-        cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
-        cpi->total_blockiness += frame_blockiness;
-      }
-
-      if (cpi->b_calculate_consistency) {
-        if (!cm->seq_params.use_highbitdepth) {
-          const double this_inconsistency = aom_get_ssim_metrics(
-              orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
-              orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
-
-          const double peak = (double)((1 << in_bit_depth) - 1);
-          const double consistency =
-              aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);
-          if (consistency > 0.0)
-            cpi->worst_consistency =
-                AOMMIN(cpi->worst_consistency, consistency);
-          cpi->total_inconsistency += this_inconsistency;
-        }
-      }
-    }
 
     frame_all =
         aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 8bc9407..aef4637 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1112,10 +1112,6 @@
   // Indicates if row-based multi-threading should be enabled or not.
   bool row_mt;
 
-  // Indicates if 16bit frame buffers are to be used i.e., the content is >
-  // 8-bit.
-  bool use_highbitdepth;
-
   // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as
   // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B
   // format.
@@ -2907,8 +2903,8 @@
 
 void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
 
-void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
-                             int subsampling_x, int subsampling_y);
+void av1_check_initial_width(AV1_COMP *cpi, int subsampling_x,
+                             int subsampling_y);
 
 void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
                                const AV1EncoderConfig *oxcf);
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index bfc9db1..9072031 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -144,8 +144,7 @@
     if (aom_alloc_frame_buffer(
             &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
             cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-            cm->seq_params.use_highbitdepth, tpl_data->border_in_pixels,
-            cm->features.byte_alignment))
+            tpl_data->border_in_pixels, cm->features.byte_alignment))
       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate frame buffer");
   }
@@ -352,9 +351,8 @@
   if (aom_realloc_frame_buffer(
           &cpi->alt_ref_buffer, oxcf->frm_dim_cfg.width,
           oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
-          NULL))
+          seq_params->subsampling_y, cpi->oxcf.border_in_pixels,
+          cm->features.byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -365,31 +363,30 @@
   const int byte_alignment = cm->features.byte_alignment;
   if (aom_realloc_frame_buffer(
           &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
+          seq_params->subsampling_y, cpi->oxcf.border_in_pixels, byte_alignment,
+          NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->trial_frame_rst, cm->superres_upscaled_width,
           cm->superres_upscaled_height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL))
+          seq_params->subsampling_y, AOM_RESTORATION_FRAME_BORDER,
+          byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate trial restored frame buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
+          seq_params->subsampling_y, cpi->oxcf.border_in_pixels, byte_alignment,
+          NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
   if (aom_realloc_frame_buffer(
           &cpi->scaled_last_source, cm->width, cm->height,
           seq_params->subsampling_x, seq_params->subsampling_y,
-          seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-          byte_alignment, NULL, NULL, NULL))
+          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
 }
@@ -407,8 +404,7 @@
   if (aom_realloc_frame_buffer(
           &cpi->scaled_source, scaled_width, scaled_height,
           cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-          cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
-          cm->features.byte_alignment, NULL, NULL, NULL))
+          AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
   assert(cpi->scaled_source.y_crop_width == scaled_width);
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 5e3bd61..ac68738 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -555,8 +555,8 @@
           if (aom_realloc_frame_buffer(
                   &new_fb->buf, cm->width, cm->height,
                   cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-                  cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
-                  cm->features.byte_alignment, NULL, NULL, NULL)) {
+                  AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL,
+                  NULL)) {
             if (force_scaling) {
               // Release the reference acquired in the get_free_fb() call above.
               --new_fb->ref_count;
@@ -946,28 +946,16 @@
       p_cur += (y_pos * stride_cur + x_pos);
       p_ref += (y_pos * stride_ref + x_pos);
 
-      if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-        uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
-        uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p16_cur[tmpX] != p16_ref[tmpX]) {
-              match = 0;
-            }
+      uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+      uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+      for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+        for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+          if (p16_cur[tmpX] != p16_ref[tmpX]) {
+            match = 0;
           }
-          p16_cur += stride_cur;
-          p16_ref += stride_ref;
         }
-      } else {
-        for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
-          for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
-            if (p_cur[tmpX] != p_ref[tmpX]) {
-              match = 0;
-            }
-          }
-          p_cur += stride_cur;
-          p_ref += stride_ref;
-        }
+        p16_cur += stride_cur;
+        p16_ref += stride_ref;
       }
 
       if (match) {
@@ -1039,7 +1027,6 @@
   const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
   const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
   double log_sum = 0.0;
-  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
 
   // Loop through each 16x16 block.
   for (int row = 0; row < num_rows; ++row) {
@@ -1061,12 +1048,8 @@
           buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
           buf.stride = y_stride;
 
-          if (use_hbd) {
-            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
-                                                      xd->bd);
-          } else {
-            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
-          }
+          var +=
+              av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8, xd->bd);
 
           num_of_var += 1.0;
         }
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index e3dbb8d..e84d0eb 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -522,286 +522,284 @@
 
 static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  if (cm->seq_params.use_highbitdepth) {
-    switch (cm->seq_params.bit_depth) {
-      case AOM_BITS_8:
-        HIGHBD_BFP_WRAPPER(64, 16, 8)
-        HIGHBD_BFP_WRAPPER(16, 64, 8)
-        HIGHBD_BFP_WRAPPER(32, 8, 8)
-        HIGHBD_BFP_WRAPPER(8, 32, 8)
-        HIGHBD_BFP_WRAPPER(16, 4, 8)
-        HIGHBD_BFP_WRAPPER(4, 16, 8)
-        HIGHBD_BFP_WRAPPER(32, 16, 8)
-        HIGHBD_BFP_WRAPPER(16, 32, 8)
-        HIGHBD_BFP_WRAPPER(64, 32, 8)
-        HIGHBD_BFP_WRAPPER(32, 64, 8)
-        HIGHBD_BFP_WRAPPER(32, 32, 8)
-        HIGHBD_BFP_WRAPPER(64, 64, 8)
-        HIGHBD_BFP_WRAPPER(16, 16, 8)
-        HIGHBD_BFP_WRAPPER(16, 8, 8)
-        HIGHBD_BFP_WRAPPER(8, 16, 8)
-        HIGHBD_BFP_WRAPPER(8, 8, 8)
-        HIGHBD_BFP_WRAPPER(8, 4, 8)
-        HIGHBD_BFP_WRAPPER(4, 8, 8)
-        HIGHBD_BFP_WRAPPER(4, 4, 8)
-        HIGHBD_BFP_WRAPPER(128, 128, 8)
-        HIGHBD_BFP_WRAPPER(128, 64, 8)
-        HIGHBD_BFP_WRAPPER(64, 128, 8)
+  switch (cm->seq_params.bit_depth) {
+    case AOM_BITS_8:
+      HIGHBD_BFP_WRAPPER(64, 16, 8)
+      HIGHBD_BFP_WRAPPER(16, 64, 8)
+      HIGHBD_BFP_WRAPPER(32, 8, 8)
+      HIGHBD_BFP_WRAPPER(8, 32, 8)
+      HIGHBD_BFP_WRAPPER(16, 4, 8)
+      HIGHBD_BFP_WRAPPER(4, 16, 8)
+      HIGHBD_BFP_WRAPPER(32, 16, 8)
+      HIGHBD_BFP_WRAPPER(16, 32, 8)
+      HIGHBD_BFP_WRAPPER(64, 32, 8)
+      HIGHBD_BFP_WRAPPER(32, 64, 8)
+      HIGHBD_BFP_WRAPPER(32, 32, 8)
+      HIGHBD_BFP_WRAPPER(64, 64, 8)
+      HIGHBD_BFP_WRAPPER(16, 16, 8)
+      HIGHBD_BFP_WRAPPER(16, 8, 8)
+      HIGHBD_BFP_WRAPPER(8, 16, 8)
+      HIGHBD_BFP_WRAPPER(8, 8, 8)
+      HIGHBD_BFP_WRAPPER(8, 4, 8)
+      HIGHBD_BFP_WRAPPER(4, 8, 8)
+      HIGHBD_BFP_WRAPPER(4, 4, 8)
+      HIGHBD_BFP_WRAPPER(128, 128, 8)
+      HIGHBD_BFP_WRAPPER(128, 64, 8)
+      HIGHBD_BFP_WRAPPER(64, 128, 8)
 
-        HIGHBD_MBFP_WRAPPER(128, 128, 8)
-        HIGHBD_MBFP_WRAPPER(128, 64, 8)
-        HIGHBD_MBFP_WRAPPER(64, 128, 8)
-        HIGHBD_MBFP_WRAPPER(64, 64, 8)
-        HIGHBD_MBFP_WRAPPER(64, 32, 8)
-        HIGHBD_MBFP_WRAPPER(32, 64, 8)
-        HIGHBD_MBFP_WRAPPER(32, 32, 8)
-        HIGHBD_MBFP_WRAPPER(32, 16, 8)
-        HIGHBD_MBFP_WRAPPER(16, 32, 8)
-        HIGHBD_MBFP_WRAPPER(16, 16, 8)
-        HIGHBD_MBFP_WRAPPER(8, 16, 8)
-        HIGHBD_MBFP_WRAPPER(16, 8, 8)
-        HIGHBD_MBFP_WRAPPER(8, 8, 8)
-        HIGHBD_MBFP_WRAPPER(4, 8, 8)
-        HIGHBD_MBFP_WRAPPER(8, 4, 8)
-        HIGHBD_MBFP_WRAPPER(4, 4, 8)
-        HIGHBD_MBFP_WRAPPER(64, 16, 8)
-        HIGHBD_MBFP_WRAPPER(16, 64, 8)
-        HIGHBD_MBFP_WRAPPER(32, 8, 8)
-        HIGHBD_MBFP_WRAPPER(8, 32, 8)
-        HIGHBD_MBFP_WRAPPER(16, 4, 8)
-        HIGHBD_MBFP_WRAPPER(4, 16, 8)
+      HIGHBD_MBFP_WRAPPER(128, 128, 8)
+      HIGHBD_MBFP_WRAPPER(128, 64, 8)
+      HIGHBD_MBFP_WRAPPER(64, 128, 8)
+      HIGHBD_MBFP_WRAPPER(64, 64, 8)
+      HIGHBD_MBFP_WRAPPER(64, 32, 8)
+      HIGHBD_MBFP_WRAPPER(32, 64, 8)
+      HIGHBD_MBFP_WRAPPER(32, 32, 8)
+      HIGHBD_MBFP_WRAPPER(32, 16, 8)
+      HIGHBD_MBFP_WRAPPER(16, 32, 8)
+      HIGHBD_MBFP_WRAPPER(16, 16, 8)
+      HIGHBD_MBFP_WRAPPER(8, 16, 8)
+      HIGHBD_MBFP_WRAPPER(16, 8, 8)
+      HIGHBD_MBFP_WRAPPER(8, 8, 8)
+      HIGHBD_MBFP_WRAPPER(4, 8, 8)
+      HIGHBD_MBFP_WRAPPER(8, 4, 8)
+      HIGHBD_MBFP_WRAPPER(4, 4, 8)
+      HIGHBD_MBFP_WRAPPER(64, 16, 8)
+      HIGHBD_MBFP_WRAPPER(16, 64, 8)
+      HIGHBD_MBFP_WRAPPER(32, 8, 8)
+      HIGHBD_MBFP_WRAPPER(8, 32, 8)
+      HIGHBD_MBFP_WRAPPER(16, 4, 8)
+      HIGHBD_MBFP_WRAPPER(4, 16, 8)
 
-        LOWBD_OBFP_WRAPPER(128, 128)
-        LOWBD_OBFP_WRAPPER(128, 64)
-        LOWBD_OBFP_WRAPPER(64, 128)
-        LOWBD_OBFP_WRAPPER(64, 64)
-        LOWBD_OBFP_WRAPPER(64, 32)
-        LOWBD_OBFP_WRAPPER(32, 64)
-        LOWBD_OBFP_WRAPPER(32, 32)
-        LOWBD_OBFP_WRAPPER(32, 16)
-        LOWBD_OBFP_WRAPPER(16, 32)
-        LOWBD_OBFP_WRAPPER(16, 16)
-        LOWBD_OBFP_WRAPPER(8, 16)
-        LOWBD_OBFP_WRAPPER(16, 8)
-        LOWBD_OBFP_WRAPPER(8, 8)
-        LOWBD_OBFP_WRAPPER(4, 8)
-        LOWBD_OBFP_WRAPPER(8, 4)
-        LOWBD_OBFP_WRAPPER(4, 4)
-        LOWBD_OBFP_WRAPPER(64, 16)
-        LOWBD_OBFP_WRAPPER(16, 64)
-        LOWBD_OBFP_WRAPPER(32, 8)
-        LOWBD_OBFP_WRAPPER(8, 32)
-        LOWBD_OBFP_WRAPPER(16, 4)
-        LOWBD_OBFP_WRAPPER(4, 16)
+      LOWBD_OBFP_WRAPPER(128, 128)
+      LOWBD_OBFP_WRAPPER(128, 64)
+      LOWBD_OBFP_WRAPPER(64, 128)
+      LOWBD_OBFP_WRAPPER(64, 64)
+      LOWBD_OBFP_WRAPPER(64, 32)
+      LOWBD_OBFP_WRAPPER(32, 64)
+      LOWBD_OBFP_WRAPPER(32, 32)
+      LOWBD_OBFP_WRAPPER(32, 16)
+      LOWBD_OBFP_WRAPPER(16, 32)
+      LOWBD_OBFP_WRAPPER(16, 16)
+      LOWBD_OBFP_WRAPPER(8, 16)
+      LOWBD_OBFP_WRAPPER(16, 8)
+      LOWBD_OBFP_WRAPPER(8, 8)
+      LOWBD_OBFP_WRAPPER(4, 8)
+      LOWBD_OBFP_WRAPPER(8, 4)
+      LOWBD_OBFP_WRAPPER(4, 4)
+      LOWBD_OBFP_WRAPPER(64, 16)
+      LOWBD_OBFP_WRAPPER(16, 64)
+      LOWBD_OBFP_WRAPPER(32, 8)
+      LOWBD_OBFP_WRAPPER(8, 32)
+      LOWBD_OBFP_WRAPPER(16, 4)
+      LOWBD_OBFP_WRAPPER(4, 16)
 
-        HIGHBD_SDSFP_WRAPPER(128, 128, 8);
-        HIGHBD_SDSFP_WRAPPER(128, 64, 8);
-        HIGHBD_SDSFP_WRAPPER(64, 128, 8);
-        HIGHBD_SDSFP_WRAPPER(64, 64, 8);
-        HIGHBD_SDSFP_WRAPPER(64, 32, 8);
-        HIGHBD_SDSFP_WRAPPER(64, 16, 8);
-        HIGHBD_SDSFP_WRAPPER(32, 64, 8);
-        HIGHBD_SDSFP_WRAPPER(32, 32, 8);
-        HIGHBD_SDSFP_WRAPPER(32, 16, 8);
-        HIGHBD_SDSFP_WRAPPER(32, 8, 8);
-        HIGHBD_SDSFP_WRAPPER(16, 64, 8);
-        HIGHBD_SDSFP_WRAPPER(16, 32, 8);
-        HIGHBD_SDSFP_WRAPPER(16, 16, 8);
-        HIGHBD_SDSFP_WRAPPER(16, 8, 8);
-        HIGHBD_SDSFP_WRAPPER(8, 16, 8);
-        HIGHBD_SDSFP_WRAPPER(8, 8, 8);
-        HIGHBD_SDSFP_WRAPPER(4, 16, 8);
-        HIGHBD_SDSFP_WRAPPER(4, 8, 8);
-        HIGHBD_SDSFP_WRAPPER(8, 32, 8);
-        break;
+      HIGHBD_SDSFP_WRAPPER(128, 128, 8);
+      HIGHBD_SDSFP_WRAPPER(128, 64, 8);
+      HIGHBD_SDSFP_WRAPPER(64, 128, 8);
+      HIGHBD_SDSFP_WRAPPER(64, 64, 8);
+      HIGHBD_SDSFP_WRAPPER(64, 32, 8);
+      HIGHBD_SDSFP_WRAPPER(64, 16, 8);
+      HIGHBD_SDSFP_WRAPPER(32, 64, 8);
+      HIGHBD_SDSFP_WRAPPER(32, 32, 8);
+      HIGHBD_SDSFP_WRAPPER(32, 16, 8);
+      HIGHBD_SDSFP_WRAPPER(32, 8, 8);
+      HIGHBD_SDSFP_WRAPPER(16, 64, 8);
+      HIGHBD_SDSFP_WRAPPER(16, 32, 8);
+      HIGHBD_SDSFP_WRAPPER(16, 16, 8);
+      HIGHBD_SDSFP_WRAPPER(16, 8, 8);
+      HIGHBD_SDSFP_WRAPPER(8, 16, 8);
+      HIGHBD_SDSFP_WRAPPER(8, 8, 8);
+      HIGHBD_SDSFP_WRAPPER(4, 16, 8);
+      HIGHBD_SDSFP_WRAPPER(4, 8, 8);
+      HIGHBD_SDSFP_WRAPPER(8, 32, 8);
+      break;
 
-      case AOM_BITS_10:
-        HIGHBD_BFP_WRAPPER(64, 16, 10)
-        HIGHBD_BFP_WRAPPER(16, 64, 10)
-        HIGHBD_BFP_WRAPPER(32, 8, 10)
-        HIGHBD_BFP_WRAPPER(8, 32, 10)
-        HIGHBD_BFP_WRAPPER(16, 4, 10)
-        HIGHBD_BFP_WRAPPER(4, 16, 10)
-        HIGHBD_BFP_WRAPPER(32, 16, 10)
-        HIGHBD_BFP_WRAPPER(16, 32, 10)
-        HIGHBD_BFP_WRAPPER(64, 32, 10)
-        HIGHBD_BFP_WRAPPER(32, 64, 10)
-        HIGHBD_BFP_WRAPPER(32, 32, 10)
-        HIGHBD_BFP_WRAPPER(64, 64, 10)
-        HIGHBD_BFP_WRAPPER(16, 16, 10)
-        HIGHBD_BFP_WRAPPER(16, 8, 10)
-        HIGHBD_BFP_WRAPPER(8, 16, 10)
-        HIGHBD_BFP_WRAPPER(8, 8, 10)
-        HIGHBD_BFP_WRAPPER(8, 4, 10)
-        HIGHBD_BFP_WRAPPER(4, 8, 10)
-        HIGHBD_BFP_WRAPPER(4, 4, 10)
-        HIGHBD_BFP_WRAPPER(128, 128, 10)
-        HIGHBD_BFP_WRAPPER(128, 64, 10)
-        HIGHBD_BFP_WRAPPER(64, 128, 10)
+    case AOM_BITS_10:
+      HIGHBD_BFP_WRAPPER(64, 16, 10)
+      HIGHBD_BFP_WRAPPER(16, 64, 10)
+      HIGHBD_BFP_WRAPPER(32, 8, 10)
+      HIGHBD_BFP_WRAPPER(8, 32, 10)
+      HIGHBD_BFP_WRAPPER(16, 4, 10)
+      HIGHBD_BFP_WRAPPER(4, 16, 10)
+      HIGHBD_BFP_WRAPPER(32, 16, 10)
+      HIGHBD_BFP_WRAPPER(16, 32, 10)
+      HIGHBD_BFP_WRAPPER(64, 32, 10)
+      HIGHBD_BFP_WRAPPER(32, 64, 10)
+      HIGHBD_BFP_WRAPPER(32, 32, 10)
+      HIGHBD_BFP_WRAPPER(64, 64, 10)
+      HIGHBD_BFP_WRAPPER(16, 16, 10)
+      HIGHBD_BFP_WRAPPER(16, 8, 10)
+      HIGHBD_BFP_WRAPPER(8, 16, 10)
+      HIGHBD_BFP_WRAPPER(8, 8, 10)
+      HIGHBD_BFP_WRAPPER(8, 4, 10)
+      HIGHBD_BFP_WRAPPER(4, 8, 10)
+      HIGHBD_BFP_WRAPPER(4, 4, 10)
+      HIGHBD_BFP_WRAPPER(128, 128, 10)
+      HIGHBD_BFP_WRAPPER(128, 64, 10)
+      HIGHBD_BFP_WRAPPER(64, 128, 10)
 
-        HIGHBD_MBFP_WRAPPER(128, 128, 10)
-        HIGHBD_MBFP_WRAPPER(128, 64, 10)
-        HIGHBD_MBFP_WRAPPER(64, 128, 10)
-        HIGHBD_MBFP_WRAPPER(64, 64, 10)
-        HIGHBD_MBFP_WRAPPER(64, 32, 10)
-        HIGHBD_MBFP_WRAPPER(32, 64, 10)
-        HIGHBD_MBFP_WRAPPER(32, 32, 10)
-        HIGHBD_MBFP_WRAPPER(32, 16, 10)
-        HIGHBD_MBFP_WRAPPER(16, 32, 10)
-        HIGHBD_MBFP_WRAPPER(16, 16, 10)
-        HIGHBD_MBFP_WRAPPER(8, 16, 10)
-        HIGHBD_MBFP_WRAPPER(16, 8, 10)
-        HIGHBD_MBFP_WRAPPER(8, 8, 10)
-        HIGHBD_MBFP_WRAPPER(4, 8, 10)
-        HIGHBD_MBFP_WRAPPER(8, 4, 10)
-        HIGHBD_MBFP_WRAPPER(4, 4, 10)
-        HIGHBD_MBFP_WRAPPER(64, 16, 10)
-        HIGHBD_MBFP_WRAPPER(16, 64, 10)
-        HIGHBD_MBFP_WRAPPER(32, 8, 10)
-        HIGHBD_MBFP_WRAPPER(8, 32, 10)
-        HIGHBD_MBFP_WRAPPER(16, 4, 10)
-        HIGHBD_MBFP_WRAPPER(4, 16, 10)
+      HIGHBD_MBFP_WRAPPER(128, 128, 10)
+      HIGHBD_MBFP_WRAPPER(128, 64, 10)
+      HIGHBD_MBFP_WRAPPER(64, 128, 10)
+      HIGHBD_MBFP_WRAPPER(64, 64, 10)
+      HIGHBD_MBFP_WRAPPER(64, 32, 10)
+      HIGHBD_MBFP_WRAPPER(32, 64, 10)
+      HIGHBD_MBFP_WRAPPER(32, 32, 10)
+      HIGHBD_MBFP_WRAPPER(32, 16, 10)
+      HIGHBD_MBFP_WRAPPER(16, 32, 10)
+      HIGHBD_MBFP_WRAPPER(16, 16, 10)
+      HIGHBD_MBFP_WRAPPER(8, 16, 10)
+      HIGHBD_MBFP_WRAPPER(16, 8, 10)
+      HIGHBD_MBFP_WRAPPER(8, 8, 10)
+      HIGHBD_MBFP_WRAPPER(4, 8, 10)
+      HIGHBD_MBFP_WRAPPER(8, 4, 10)
+      HIGHBD_MBFP_WRAPPER(4, 4, 10)
+      HIGHBD_MBFP_WRAPPER(64, 16, 10)
+      HIGHBD_MBFP_WRAPPER(16, 64, 10)
+      HIGHBD_MBFP_WRAPPER(32, 8, 10)
+      HIGHBD_MBFP_WRAPPER(8, 32, 10)
+      HIGHBD_MBFP_WRAPPER(16, 4, 10)
+      HIGHBD_MBFP_WRAPPER(4, 16, 10)
 
-        HIGHBD_OBFP_WRAPPER(128, 128, 10)
-        HIGHBD_OBFP_WRAPPER(128, 64, 10)
-        HIGHBD_OBFP_WRAPPER(64, 128, 10)
-        HIGHBD_OBFP_WRAPPER(64, 64, 10)
-        HIGHBD_OBFP_WRAPPER(64, 32, 10)
-        HIGHBD_OBFP_WRAPPER(32, 64, 10)
-        HIGHBD_OBFP_WRAPPER(32, 32, 10)
-        HIGHBD_OBFP_WRAPPER(32, 16, 10)
-        HIGHBD_OBFP_WRAPPER(16, 32, 10)
-        HIGHBD_OBFP_WRAPPER(16, 16, 10)
-        HIGHBD_OBFP_WRAPPER(8, 16, 10)
-        HIGHBD_OBFP_WRAPPER(16, 8, 10)
-        HIGHBD_OBFP_WRAPPER(8, 8, 10)
-        HIGHBD_OBFP_WRAPPER(4, 8, 10)
-        HIGHBD_OBFP_WRAPPER(8, 4, 10)
-        HIGHBD_OBFP_WRAPPER(4, 4, 10)
-        HIGHBD_OBFP_WRAPPER(64, 16, 10)
-        HIGHBD_OBFP_WRAPPER(16, 64, 10)
-        HIGHBD_OBFP_WRAPPER(32, 8, 10)
-        HIGHBD_OBFP_WRAPPER(8, 32, 10)
-        HIGHBD_OBFP_WRAPPER(16, 4, 10)
-        HIGHBD_OBFP_WRAPPER(4, 16, 10)
+      HIGHBD_OBFP_WRAPPER(128, 128, 10)
+      HIGHBD_OBFP_WRAPPER(128, 64, 10)
+      HIGHBD_OBFP_WRAPPER(64, 128, 10)
+      HIGHBD_OBFP_WRAPPER(64, 64, 10)
+      HIGHBD_OBFP_WRAPPER(64, 32, 10)
+      HIGHBD_OBFP_WRAPPER(32, 64, 10)
+      HIGHBD_OBFP_WRAPPER(32, 32, 10)
+      HIGHBD_OBFP_WRAPPER(32, 16, 10)
+      HIGHBD_OBFP_WRAPPER(16, 32, 10)
+      HIGHBD_OBFP_WRAPPER(16, 16, 10)
+      HIGHBD_OBFP_WRAPPER(8, 16, 10)
+      HIGHBD_OBFP_WRAPPER(16, 8, 10)
+      HIGHBD_OBFP_WRAPPER(8, 8, 10)
+      HIGHBD_OBFP_WRAPPER(4, 8, 10)
+      HIGHBD_OBFP_WRAPPER(8, 4, 10)
+      HIGHBD_OBFP_WRAPPER(4, 4, 10)
+      HIGHBD_OBFP_WRAPPER(64, 16, 10)
+      HIGHBD_OBFP_WRAPPER(16, 64, 10)
+      HIGHBD_OBFP_WRAPPER(32, 8, 10)
+      HIGHBD_OBFP_WRAPPER(8, 32, 10)
+      HIGHBD_OBFP_WRAPPER(16, 4, 10)
+      HIGHBD_OBFP_WRAPPER(4, 16, 10)
 
-        HIGHBD_SDSFP_WRAPPER(128, 128, 10);
-        HIGHBD_SDSFP_WRAPPER(128, 64, 10);
-        HIGHBD_SDSFP_WRAPPER(64, 128, 10);
-        HIGHBD_SDSFP_WRAPPER(64, 64, 10);
-        HIGHBD_SDSFP_WRAPPER(64, 32, 10);
-        HIGHBD_SDSFP_WRAPPER(64, 16, 10);
-        HIGHBD_SDSFP_WRAPPER(32, 64, 10);
-        HIGHBD_SDSFP_WRAPPER(32, 32, 10);
-        HIGHBD_SDSFP_WRAPPER(32, 16, 10);
-        HIGHBD_SDSFP_WRAPPER(32, 8, 10);
-        HIGHBD_SDSFP_WRAPPER(16, 64, 10);
-        HIGHBD_SDSFP_WRAPPER(16, 32, 10);
-        HIGHBD_SDSFP_WRAPPER(16, 16, 10);
-        HIGHBD_SDSFP_WRAPPER(16, 8, 10);
-        HIGHBD_SDSFP_WRAPPER(8, 16, 10);
-        HIGHBD_SDSFP_WRAPPER(8, 8, 10);
-        HIGHBD_SDSFP_WRAPPER(4, 16, 10);
-        HIGHBD_SDSFP_WRAPPER(4, 8, 10);
-        HIGHBD_SDSFP_WRAPPER(8, 32, 10);
-        break;
+      HIGHBD_SDSFP_WRAPPER(128, 128, 10);
+      HIGHBD_SDSFP_WRAPPER(128, 64, 10);
+      HIGHBD_SDSFP_WRAPPER(64, 128, 10);
+      HIGHBD_SDSFP_WRAPPER(64, 64, 10);
+      HIGHBD_SDSFP_WRAPPER(64, 32, 10);
+      HIGHBD_SDSFP_WRAPPER(64, 16, 10);
+      HIGHBD_SDSFP_WRAPPER(32, 64, 10);
+      HIGHBD_SDSFP_WRAPPER(32, 32, 10);
+      HIGHBD_SDSFP_WRAPPER(32, 16, 10);
+      HIGHBD_SDSFP_WRAPPER(32, 8, 10);
+      HIGHBD_SDSFP_WRAPPER(16, 64, 10);
+      HIGHBD_SDSFP_WRAPPER(16, 32, 10);
+      HIGHBD_SDSFP_WRAPPER(16, 16, 10);
+      HIGHBD_SDSFP_WRAPPER(16, 8, 10);
+      HIGHBD_SDSFP_WRAPPER(8, 16, 10);
+      HIGHBD_SDSFP_WRAPPER(8, 8, 10);
+      HIGHBD_SDSFP_WRAPPER(4, 16, 10);
+      HIGHBD_SDSFP_WRAPPER(4, 8, 10);
+      HIGHBD_SDSFP_WRAPPER(8, 32, 10);
+      break;
 
-      case AOM_BITS_12:
-        HIGHBD_BFP_WRAPPER(64, 16, 12)
-        HIGHBD_BFP_WRAPPER(16, 64, 12)
-        HIGHBD_BFP_WRAPPER(32, 8, 12)
-        HIGHBD_BFP_WRAPPER(8, 32, 12)
-        HIGHBD_BFP_WRAPPER(16, 4, 12)
-        HIGHBD_BFP_WRAPPER(4, 16, 12)
-        HIGHBD_BFP_WRAPPER(32, 16, 12)
-        HIGHBD_BFP_WRAPPER(16, 32, 12)
-        HIGHBD_BFP_WRAPPER(64, 32, 12)
-        HIGHBD_BFP_WRAPPER(32, 64, 12)
-        HIGHBD_BFP_WRAPPER(32, 32, 12)
-        HIGHBD_BFP_WRAPPER(64, 64, 12)
-        HIGHBD_BFP_WRAPPER(16, 16, 12)
-        HIGHBD_BFP_WRAPPER(16, 8, 12)
-        HIGHBD_BFP_WRAPPER(8, 16, 12)
-        HIGHBD_BFP_WRAPPER(8, 8, 12)
-        HIGHBD_BFP_WRAPPER(8, 4, 12)
-        HIGHBD_BFP_WRAPPER(4, 8, 12)
-        HIGHBD_BFP_WRAPPER(4, 4, 12)
-        HIGHBD_BFP_WRAPPER(128, 128, 12)
-        HIGHBD_BFP_WRAPPER(128, 64, 12)
-        HIGHBD_BFP_WRAPPER(64, 128, 12)
+    case AOM_BITS_12:
+      HIGHBD_BFP_WRAPPER(64, 16, 12)
+      HIGHBD_BFP_WRAPPER(16, 64, 12)
+      HIGHBD_BFP_WRAPPER(32, 8, 12)
+      HIGHBD_BFP_WRAPPER(8, 32, 12)
+      HIGHBD_BFP_WRAPPER(16, 4, 12)
+      HIGHBD_BFP_WRAPPER(4, 16, 12)
+      HIGHBD_BFP_WRAPPER(32, 16, 12)
+      HIGHBD_BFP_WRAPPER(16, 32, 12)
+      HIGHBD_BFP_WRAPPER(64, 32, 12)
+      HIGHBD_BFP_WRAPPER(32, 64, 12)
+      HIGHBD_BFP_WRAPPER(32, 32, 12)
+      HIGHBD_BFP_WRAPPER(64, 64, 12)
+      HIGHBD_BFP_WRAPPER(16, 16, 12)
+      HIGHBD_BFP_WRAPPER(16, 8, 12)
+      HIGHBD_BFP_WRAPPER(8, 16, 12)
+      HIGHBD_BFP_WRAPPER(8, 8, 12)
+      HIGHBD_BFP_WRAPPER(8, 4, 12)
+      HIGHBD_BFP_WRAPPER(4, 8, 12)
+      HIGHBD_BFP_WRAPPER(4, 4, 12)
+      HIGHBD_BFP_WRAPPER(128, 128, 12)
+      HIGHBD_BFP_WRAPPER(128, 64, 12)
+      HIGHBD_BFP_WRAPPER(64, 128, 12)
 
-        HIGHBD_MBFP_WRAPPER(128, 128, 12)
-        HIGHBD_MBFP_WRAPPER(128, 64, 12)
-        HIGHBD_MBFP_WRAPPER(64, 128, 12)
-        HIGHBD_MBFP_WRAPPER(64, 64, 12)
-        HIGHBD_MBFP_WRAPPER(64, 32, 12)
-        HIGHBD_MBFP_WRAPPER(32, 64, 12)
-        HIGHBD_MBFP_WRAPPER(32, 32, 12)
-        HIGHBD_MBFP_WRAPPER(32, 16, 12)
-        HIGHBD_MBFP_WRAPPER(16, 32, 12)
-        HIGHBD_MBFP_WRAPPER(16, 16, 12)
-        HIGHBD_MBFP_WRAPPER(8, 16, 12)
-        HIGHBD_MBFP_WRAPPER(16, 8, 12)
-        HIGHBD_MBFP_WRAPPER(8, 8, 12)
-        HIGHBD_MBFP_WRAPPER(4, 8, 12)
-        HIGHBD_MBFP_WRAPPER(8, 4, 12)
-        HIGHBD_MBFP_WRAPPER(4, 4, 12)
-        HIGHBD_MBFP_WRAPPER(64, 16, 12)
-        HIGHBD_MBFP_WRAPPER(16, 64, 12)
-        HIGHBD_MBFP_WRAPPER(32, 8, 12)
-        HIGHBD_MBFP_WRAPPER(8, 32, 12)
-        HIGHBD_MBFP_WRAPPER(16, 4, 12)
-        HIGHBD_MBFP_WRAPPER(4, 16, 12)
+      HIGHBD_MBFP_WRAPPER(128, 128, 12)
+      HIGHBD_MBFP_WRAPPER(128, 64, 12)
+      HIGHBD_MBFP_WRAPPER(64, 128, 12)
+      HIGHBD_MBFP_WRAPPER(64, 64, 12)
+      HIGHBD_MBFP_WRAPPER(64, 32, 12)
+      HIGHBD_MBFP_WRAPPER(32, 64, 12)
+      HIGHBD_MBFP_WRAPPER(32, 32, 12)
+      HIGHBD_MBFP_WRAPPER(32, 16, 12)
+      HIGHBD_MBFP_WRAPPER(16, 32, 12)
+      HIGHBD_MBFP_WRAPPER(16, 16, 12)
+      HIGHBD_MBFP_WRAPPER(8, 16, 12)
+      HIGHBD_MBFP_WRAPPER(16, 8, 12)
+      HIGHBD_MBFP_WRAPPER(8, 8, 12)
+      HIGHBD_MBFP_WRAPPER(4, 8, 12)
+      HIGHBD_MBFP_WRAPPER(8, 4, 12)
+      HIGHBD_MBFP_WRAPPER(4, 4, 12)
+      HIGHBD_MBFP_WRAPPER(64, 16, 12)
+      HIGHBD_MBFP_WRAPPER(16, 64, 12)
+      HIGHBD_MBFP_WRAPPER(32, 8, 12)
+      HIGHBD_MBFP_WRAPPER(8, 32, 12)
+      HIGHBD_MBFP_WRAPPER(16, 4, 12)
+      HIGHBD_MBFP_WRAPPER(4, 16, 12)
 
-        HIGHBD_OBFP_WRAPPER(128, 128, 12)
-        HIGHBD_OBFP_WRAPPER(128, 64, 12)
-        HIGHBD_OBFP_WRAPPER(64, 128, 12)
-        HIGHBD_OBFP_WRAPPER(64, 64, 12)
-        HIGHBD_OBFP_WRAPPER(64, 32, 12)
-        HIGHBD_OBFP_WRAPPER(32, 64, 12)
-        HIGHBD_OBFP_WRAPPER(32, 32, 12)
-        HIGHBD_OBFP_WRAPPER(32, 16, 12)
-        HIGHBD_OBFP_WRAPPER(16, 32, 12)
-        HIGHBD_OBFP_WRAPPER(16, 16, 12)
-        HIGHBD_OBFP_WRAPPER(8, 16, 12)
-        HIGHBD_OBFP_WRAPPER(16, 8, 12)
-        HIGHBD_OBFP_WRAPPER(8, 8, 12)
-        HIGHBD_OBFP_WRAPPER(4, 8, 12)
-        HIGHBD_OBFP_WRAPPER(8, 4, 12)
-        HIGHBD_OBFP_WRAPPER(4, 4, 12)
-        HIGHBD_OBFP_WRAPPER(64, 16, 12)
-        HIGHBD_OBFP_WRAPPER(16, 64, 12)
-        HIGHBD_OBFP_WRAPPER(32, 8, 12)
-        HIGHBD_OBFP_WRAPPER(8, 32, 12)
-        HIGHBD_OBFP_WRAPPER(16, 4, 12)
-        HIGHBD_OBFP_WRAPPER(4, 16, 12)
+      HIGHBD_OBFP_WRAPPER(128, 128, 12)
+      HIGHBD_OBFP_WRAPPER(128, 64, 12)
+      HIGHBD_OBFP_WRAPPER(64, 128, 12)
+      HIGHBD_OBFP_WRAPPER(64, 64, 12)
+      HIGHBD_OBFP_WRAPPER(64, 32, 12)
+      HIGHBD_OBFP_WRAPPER(32, 64, 12)
+      HIGHBD_OBFP_WRAPPER(32, 32, 12)
+      HIGHBD_OBFP_WRAPPER(32, 16, 12)
+      HIGHBD_OBFP_WRAPPER(16, 32, 12)
+      HIGHBD_OBFP_WRAPPER(16, 16, 12)
+      HIGHBD_OBFP_WRAPPER(8, 16, 12)
+      HIGHBD_OBFP_WRAPPER(16, 8, 12)
+      HIGHBD_OBFP_WRAPPER(8, 8, 12)
+      HIGHBD_OBFP_WRAPPER(4, 8, 12)
+      HIGHBD_OBFP_WRAPPER(8, 4, 12)
+      HIGHBD_OBFP_WRAPPER(4, 4, 12)
+      HIGHBD_OBFP_WRAPPER(64, 16, 12)
+      HIGHBD_OBFP_WRAPPER(16, 64, 12)
+      HIGHBD_OBFP_WRAPPER(32, 8, 12)
+      HIGHBD_OBFP_WRAPPER(8, 32, 12)
+      HIGHBD_OBFP_WRAPPER(16, 4, 12)
+      HIGHBD_OBFP_WRAPPER(4, 16, 12)
 
-        HIGHBD_SDSFP_WRAPPER(128, 128, 12);
-        HIGHBD_SDSFP_WRAPPER(128, 64, 12);
-        HIGHBD_SDSFP_WRAPPER(64, 128, 12);
-        HIGHBD_SDSFP_WRAPPER(64, 64, 12);
-        HIGHBD_SDSFP_WRAPPER(64, 32, 12);
-        HIGHBD_SDSFP_WRAPPER(64, 16, 12);
-        HIGHBD_SDSFP_WRAPPER(32, 64, 12);
-        HIGHBD_SDSFP_WRAPPER(32, 32, 12);
-        HIGHBD_SDSFP_WRAPPER(32, 16, 12);
-        HIGHBD_SDSFP_WRAPPER(32, 8, 12);
-        HIGHBD_SDSFP_WRAPPER(16, 64, 12);
-        HIGHBD_SDSFP_WRAPPER(16, 32, 12);
-        HIGHBD_SDSFP_WRAPPER(16, 16, 12);
-        HIGHBD_SDSFP_WRAPPER(16, 8, 12);
-        HIGHBD_SDSFP_WRAPPER(8, 16, 12);
-        HIGHBD_SDSFP_WRAPPER(8, 8, 12);
-        HIGHBD_SDSFP_WRAPPER(4, 16, 12);
-        HIGHBD_SDSFP_WRAPPER(4, 8, 12);
-        HIGHBD_SDSFP_WRAPPER(8, 32, 12);
-        break;
+      HIGHBD_SDSFP_WRAPPER(128, 128, 12);
+      HIGHBD_SDSFP_WRAPPER(128, 64, 12);
+      HIGHBD_SDSFP_WRAPPER(64, 128, 12);
+      HIGHBD_SDSFP_WRAPPER(64, 64, 12);
+      HIGHBD_SDSFP_WRAPPER(64, 32, 12);
+      HIGHBD_SDSFP_WRAPPER(64, 16, 12);
+      HIGHBD_SDSFP_WRAPPER(32, 64, 12);
+      HIGHBD_SDSFP_WRAPPER(32, 32, 12);
+      HIGHBD_SDSFP_WRAPPER(32, 16, 12);
+      HIGHBD_SDSFP_WRAPPER(32, 8, 12);
+      HIGHBD_SDSFP_WRAPPER(16, 64, 12);
+      HIGHBD_SDSFP_WRAPPER(16, 32, 12);
+      HIGHBD_SDSFP_WRAPPER(16, 16, 12);
+      HIGHBD_SDSFP_WRAPPER(16, 8, 12);
+      HIGHBD_SDSFP_WRAPPER(8, 16, 12);
+      HIGHBD_SDSFP_WRAPPER(8, 8, 12);
+      HIGHBD_SDSFP_WRAPPER(4, 16, 12);
+      HIGHBD_SDSFP_WRAPPER(4, 8, 12);
+      HIGHBD_SDSFP_WRAPPER(8, 32, 12);
+      break;
 
-      default:
-        assert(0 &&
-               "cm->seq_params.bit_depth should be AOM_BITS_8, "
-               "AOM_BITS_10 or AOM_BITS_12");
-    }
+    default:
+      assert(0 &&
+             "cm->seq_params.bit_depth should be AOM_BITS_8, "
+             "AOM_BITS_10 or AOM_BITS_12");
   }
 }
 
@@ -836,9 +834,7 @@
   return a->y_height == b->y_height && a->y_width == b->y_width &&
          a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
          a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
-         a->border == b->border &&
-         (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
-             (b->flags & YV12_FLAG_HIGHBITDEPTH);
+         a->border == b->border;
 }
 
 static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
@@ -908,9 +904,8 @@
   // Reset the frame pointers to the current frame size.
   if (aom_realloc_frame_buffer(
           &tip_frame->buf, cm->width, cm->height, cm->seq_params.subsampling_x,
-          cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
-          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
-          NULL)) {
+          cm->seq_params.subsampling_y, cpi->oxcf.border_in_pixels,
+          cm->features.byte_alignment, NULL, NULL, NULL)) {
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
diff --git a/av1/encoder/extend.c b/av1/encoder/extend.c
index 258938c..2e4713f 100644
--- a/av1/encoder/extend.c
+++ b/av1/encoder/extend.c
@@ -17,47 +17,6 @@
 #include "av1/common/common.h"
 #include "av1/encoder/extend.h"
 
-static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
-                                  uint8_t *dst, int dst_pitch, int w, int h,
-                                  int extend_top, int extend_left,
-                                  int extend_bottom, int extend_right) {
-  int i, linesize;
-
-  // copy the left and right most columns out
-  const uint8_t *src_ptr1 = src;
-  const uint8_t *src_ptr2 = src + w - 1;
-  uint8_t *dst_ptr1 = dst - extend_left;
-  uint8_t *dst_ptr2 = dst + w;
-
-  for (i = 0; i < h; i++) {
-    memset(dst_ptr1, src_ptr1[0], extend_left);
-    memcpy(dst_ptr1 + extend_left, src_ptr1, w);
-    memset(dst_ptr2, src_ptr2[0], extend_right);
-    src_ptr1 += src_pitch;
-    src_ptr2 += src_pitch;
-    dst_ptr1 += dst_pitch;
-    dst_ptr2 += dst_pitch;
-  }
-
-  // Now copy the top and bottom lines into each line of the respective
-  // borders
-  src_ptr1 = dst - extend_left;
-  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
-  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
-  dst_ptr2 = dst + dst_pitch * (h)-extend_left;
-  linesize = extend_left + extend_right + w;
-
-  for (i = 0; i < extend_top; i++) {
-    memcpy(dst_ptr1, src_ptr1, linesize);
-    dst_ptr1 += dst_pitch;
-  }
-
-  for (i = 0; i < extend_bottom; i++) {
-    memcpy(dst_ptr2, src_ptr2, linesize);
-    dst_ptr2 += dst_pitch;
-  }
-}
-
 static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
                                          uint8_t *dst8, int dst_pitch, int w,
                                          int h, int extend_top, int extend_left,
@@ -119,30 +78,15 @@
   const int eb_uv = eb_y >> uv_height_subsampling;
   const int er_uv = er_y >> uv_width_subsampling;
 
-  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
-    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
-                                 dst->y_stride, src->y_crop_width,
-                                 src->y_crop_height, et_y, el_y, eb_y, er_y);
-    if (!src->monochrome) {
-      highbd_copy_and_extend_plane(
-          src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-          src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-      highbd_copy_and_extend_plane(
-          src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-          src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-    }
-    return;
-  }
-
-  copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
-                        dst->y_stride, src->y_crop_width, src->y_crop_height,
-                        et_y, el_y, eb_y, er_y);
+  highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+                               dst->y_stride, src->y_crop_width,
+                               src->y_crop_height, et_y, el_y, eb_y, er_y);
   if (!src->monochrome) {
-    copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
-                          dst->uv_stride, src->uv_crop_width,
-                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
-    copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
-                          dst->uv_stride, src->uv_crop_width,
-                          src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+    highbd_copy_and_extend_plane(
+        src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+        src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+    highbd_copy_and_extend_plane(
+        src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+        src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
   }
 }
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 880683b..3021955 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -143,24 +143,6 @@
     output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list);
 }
 
-static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
-  switch (bsize) {
-    case BLOCK_8X8: return aom_mse8x8;
-    case BLOCK_16X8: return aom_mse16x8;
-    case BLOCK_8X16: return aom_mse8x16;
-    default: return aom_mse16x16;
-  }
-}
-
-static unsigned int get_prediction_error(BLOCK_SIZE bsize,
-                                         const struct buf_2d *src,
-                                         const struct buf_2d *ref) {
-  unsigned int sse;
-  const aom_variance_fn_t fn = get_block_variance_fn(bsize);
-  fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
-  return sse;
-}
-
 static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
                                                       int bd) {
   switch (bd) {
@@ -362,17 +344,15 @@
 
   av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
   int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
-  if (seq_params->use_highbitdepth) {
-    switch (seq_params->bit_depth) {
-      case AOM_BITS_8: break;
-      case AOM_BITS_10: this_intra_error >>= 4; break;
-      case AOM_BITS_12: this_intra_error >>= 8; break;
-      default:
-        assert(0 &&
-               "seq_params->bit_depth should be AOM_BITS_8, "
-               "AOM_BITS_10 or AOM_BITS_12");
-        return -1;
-    }
+  switch (seq_params->bit_depth) {
+    case AOM_BITS_8: break;
+    case AOM_BITS_10: this_intra_error >>= 4; break;
+    case AOM_BITS_12: this_intra_error >>= 8; break;
+    default:
+      assert(0 &&
+             "seq_params->bit_depth should be AOM_BITS_8, "
+             "AOM_BITS_10 or AOM_BITS_12");
+      return -1;
   }
 
   if (this_intra_error < UL_INTRA_THRESH) {
@@ -390,23 +370,17 @@
   }
 
   int level_sample;
-  if (seq_params->use_highbitdepth) {
-    level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
-  } else {
-    level_sample = x->plane[0].src.buf[0];
-  }
+  level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
 
-  if (seq_params->use_highbitdepth) {
-    switch (seq_params->bit_depth) {
-      case AOM_BITS_8: break;
-      case AOM_BITS_10: level_sample >>= 2; break;
-      case AOM_BITS_12: level_sample >>= 4; break;
-      default:
-        assert(0 &&
-               "seq_params->bit_depth should be AOM_BITS_8, "
-               "AOM_BITS_10 or AOM_BITS_12");
-        return -1;
-    }
+  switch (seq_params->bit_depth) {
+    case AOM_BITS_8: break;
+    case AOM_BITS_10: level_sample >>= 2; break;
+    case AOM_BITS_12: level_sample >>= 4; break;
+    default:
+      assert(0 &&
+             "seq_params->bit_depth should be AOM_BITS_8, "
+             "AOM_BITS_10 or AOM_BITS_12");
+      return -1;
   }
   if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
     stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
@@ -426,13 +400,12 @@
   // Accumulate the intra error.
   stats->intra_error += (int64_t)this_intra_error;
 
-  const int hbd = is_cur_buf_hbd(xd);
   const int stride = x->plane[0].src.stride;
   uint8_t *buf = x->plane[0].src.buf;
   for (int r8 = 0; r8 < 2; ++r8) {
     for (int c8 = 0; c8 < 2; ++c8) {
       stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
-          buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+          buf + c8 * 8 + r8 * 8 * stride, stride);
     }
   }
 
@@ -440,15 +413,11 @@
 }
 
 // Returns the sum of square error between source and reference blocks.
-static int get_prediction_error_bitdepth(const int is_high_bitdepth,
-                                         const int bitdepth,
+static int get_prediction_error_bitdepth(const int bitdepth,
                                          const BLOCK_SIZE block_size,
                                          const struct buf_2d *src,
                                          const struct buf_2d *ref) {
-  if (is_high_bitdepth) {
-    return highbd_get_prediction_error(block_size, src, ref, bitdepth);
-  }
-  return get_prediction_error(block_size, src, ref);
+  return highbd_get_prediction_error(block_size, src, ref, bitdepth);
 }
 
 // Accumulates motion vector stats.
@@ -541,7 +510,6 @@
   CurrentFrame *const current_frame = &cm->current_frame;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int is_high_bitdepth = is_cur_buf_hbd(xd);
   const int bitdepth = xd->bd;
   const int mb_scale = mi_size_wide[fp_block_size];
   const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col);
@@ -556,9 +524,8 @@
                         (fp_block_size_height >> MI_SIZE_LOG2),
                         cpi->oxcf.border_in_pixels);
 
-  int motion_error =
-      get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
-                                    &x->plane[0].src, &xd->plane[0].pre[0]);
+  int motion_error = get_prediction_error_bitdepth(
+      bitdepth, bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
 
   // Compute the motion error of the 0,0 motion using the last source
   // frame as the reference. Skip the further motion search on
@@ -568,8 +535,7 @@
       cpi->unscaled_last_source->y_buffer + src_yoffset;
   unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
   const int raw_motion_error = get_prediction_error_bitdepth(
-      is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
-      &unscaled_last_source_buf_2d);
+      bitdepth, bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
   raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
 
   // TODO(pengchong): Replace the hard-coded threshold
@@ -596,9 +562,8 @@
       // Assume 0,0 motion with no mv overhead.
       xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset;
       xd->plane[0].pre[0].stride = golden_frame->y_stride;
-      gf_motion_error =
-          get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
-                                        &x->plane[0].src, &xd->plane[0].pre[0]);
+      gf_motion_error = get_prediction_error_bitdepth(
+          bitdepth, bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
       first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error);
     }
     if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) {
@@ -621,9 +586,8 @@
     if (alt_ref_frame != NULL) {
       xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset;
       xd->plane[0].pre[0].stride = alt_ref_frame->y_stride;
-      alt_motion_error =
-          get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
-                                        &x->plane[0].src, &xd->plane[0].pre[0]);
+      alt_motion_error = get_prediction_error_bitdepth(
+          bitdepth, bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
       first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error);
     }
     if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error &&
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 66f9bef..6785782 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -194,62 +194,19 @@
   return gm_sumerr;
 }
 
-static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
-                          int width, int height, int stride,
-                          const uint8_t *const dst, int p_col, int p_row,
-                          int p_width, int p_height, int p_stride,
-                          int subsampling_x, int subsampling_y,
-                          int64_t best_error, uint8_t *segment_map,
-                          int segment_map_stride) {
-  int64_t gm_sumerr = 0;
-  int warp_w, warp_h;
-  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
-  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
-  DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
-  ConvolveParams conv_params = get_conv_params(0, 0, 8);
-
-  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
-    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
-      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
-      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
-      // Only compute the error if this block contains inliers from the motion
-      // model
-      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
-      // avoid warping extra 8x8 blocks in the padded region of the frame
-      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
-      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
-      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
-      warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
-                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
-
-      gm_sumerr +=
-          av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
-                               warp_w, warp_h, p_stride);
-      if (gm_sumerr > best_error) return INT64_MAX;
-    }
-  }
-  return gm_sumerr;
-}
-
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
-                       const uint8_t *ref, int width, int height, int stride,
-                       uint8_t *dst, int p_col, int p_row, int p_width,
-                       int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int64_t best_error,
-                       uint8_t *segment_map, int segment_map_stride) {
+int64_t av1_warp_error(WarpedMotionParams *wm, int bd, const uint8_t *ref,
+                       int width, int height, int stride, uint8_t *dst,
+                       int p_col, int p_row, int p_width, int p_height,
+                       int p_stride, int subsampling_x, int subsampling_y,
+                       int64_t best_error, uint8_t *segment_map,
+                       int segment_map_stride) {
   if (wm->wmtype <= AFFINE)
     if (!av1_get_shear_params(wm)) return INT64_MAX;
 
-  if (use_hbd)
-    return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height,
-                             stride, CONVERT_TO_SHORTPTR(dst), p_col, p_row,
-                             p_width, p_height, p_stride, subsampling_x,
-                             subsampling_y, bd, best_error, segment_map,
-                             segment_map_stride);
-
-  return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
-                    p_height, p_stride, subsampling_x, subsampling_y,
-                    best_error, segment_map, segment_map_stride);
+  return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+                           CONVERT_TO_SHORTPTR(dst), p_col, p_row, p_width,
+                           p_height, p_stride, subsampling_x, subsampling_y, bd,
+                           best_error, segment_map, segment_map_stride);
 }
 
 // Factors used to calculate the thresholds for av1_warp_error
@@ -265,11 +222,10 @@
 }
 
 int64_t av1_refine_integerized_param(
-    WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
-    uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
-    int d_width, int d_height, int d_stride, int n_refinements,
-    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
-    int64_t erroradv_threshold) {
+    WarpedMotionParams *wm, TransformationType wmtype, int bd, uint8_t *ref,
+    int r_width, int r_height, int r_stride, uint8_t *dst, int d_width,
+    int d_height, int d_stride, int n_refinements, int64_t best_frame_error,
+    uint8_t *segment_map, int segment_map_stride, int64_t erroradv_threshold) {
   static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
   const int border = ERRORADV_BORDER;
   int i = 0, p;
@@ -283,7 +239,7 @@
 
   force_wmtype(wm, wmtype);
   best_error =
-      av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+      av1_warp_error(wm, bd, ref, r_width, r_height, r_stride,
                      dst + border * d_stride + border, border, border,
                      d_width - 2 * border, d_height - 2 * border, d_stride, 0,
                      0, best_frame_error, segment_map, segment_map_stride);
@@ -301,7 +257,7 @@
       // look to the left
       *param = add_param_offset(p, curr_param, -step);
       step_error =
-          av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+          av1_warp_error(wm, bd, ref, r_width, r_height, r_stride,
                          dst + border * d_stride + border, border, border,
                          d_width - 2 * border, d_height - 2 * border, d_stride,
                          0, 0, AOMMIN(best_error, error_adv_thresh),
@@ -315,7 +271,7 @@
       // look to the right
       *param = add_param_offset(p, curr_param, step);
       step_error =
-          av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+          av1_warp_error(wm, bd, ref, r_width, r_height, r_stride,
                          dst + border * d_stride + border, border, border,
                          d_width - 2 * border, d_height - 2 * border, d_stride,
                          0, 0, AOMMIN(best_error, error_adv_thresh),
@@ -332,7 +288,7 @@
       while (step_dir) {
         *param = add_param_offset(p, best_param, step * step_dir);
         step_error =
-            av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+            av1_warp_error(wm, bd, ref, r_width, r_height, r_stride,
                            dst + border * d_stride + border, border, border,
                            d_width - 2 * border, d_height - 2 * border,
                            d_stride, 0, 0, AOMMIN(best_error, error_adv_thresh),
@@ -434,12 +390,10 @@
   int num_correspondences;
   int *correspondences;
   int ref_corners[2 * MAX_CORNERS];
-  unsigned char *ref_buffer = ref->y_buffer;
+  unsigned char *ref_buffer;
   RansacFunc ransac = av1_get_ransac_type(type);
 
-  if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
-    ref_buffer = av1_downconvert_frame(ref, bit_depth);
-  }
+  ref_buffer = av1_downconvert_frame(ref, bit_depth);
 
   num_ref_corners =
       av1_fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
@@ -916,7 +870,7 @@
     int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
     YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
     MotionModel *params_by_motion, int num_motions) {
-  unsigned char *ref_buffer = ref->y_buffer;
+  unsigned char *ref_buffer;
   const int ref_width = ref->y_width;
   const int ref_height = ref->y_height;
   const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD);
@@ -931,9 +885,7 @@
       frm_width < frm_height ? get_msb(frm_width) : get_msb(frm_height);
   const int n_levels = AOMMIN(msb, N_LEVELS);
 
-  if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
-    ref_buffer = av1_downconvert_frame(ref, bit_depth);
-  }
+  ref_buffer = av1_downconvert_frame(ref, bit_depth);
 
   // TODO(sarahparker) We will want to do the source pyramid computation
   // outside of this function so it doesn't get recomputed for every
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index bfc1fec..14f3608 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -113,22 +113,21 @@
 
 // Returns the error between the result of applying motion 'wm' to the frame
 // described by 'ref' and the frame described by 'dst'.
-int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
-                       const uint8_t *ref, int width, int height, int stride,
-                       uint8_t *dst, int p_col, int p_row, int p_width,
-                       int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int64_t best_error,
-                       uint8_t *segment_map, int segment_map_stride);
+int64_t av1_warp_error(WarpedMotionParams *wm, int bd, const uint8_t *ref,
+                       int width, int height, int stride, uint8_t *dst,
+                       int p_col, int p_row, int p_width, int p_height,
+                       int p_stride, int subsampling_x, int subsampling_y,
+                       int64_t best_error, uint8_t *segment_map,
+                       int segment_map_stride);
 
 // Returns the av1_warp_error between "dst" and the result of applying the
 // motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
 // modified in place.
 int64_t av1_refine_integerized_param(
-    WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
-    uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
-    int d_width, int d_height, int d_stride, int n_refinements,
-    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
-    int64_t erroradv_threshold);
+    WarpedMotionParams *wm, TransformationType wmtype, int bd, uint8_t *ref,
+    int r_width, int r_height, int r_stride, uint8_t *dst, int d_width,
+    int d_height, int d_stride, int n_refinements, int64_t best_frame_error,
+    uint8_t *segment_map, int segment_map_stride, int64_t erroradv_threshold);
 
 /*
   Computes "num_motions" candidate global motion parameters between two frames.
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
index 4e80024..405c4b7 100644
--- a/av1/encoder/global_motion_facade.c
+++ b/av1/encoder/global_motion_facade.c
@@ -154,15 +154,15 @@
             params_by_motion[i].inliers, params_by_motion[i].num_inliers);
 
         ref_frame_error = av1_segmented_frame_error(
-            is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
-            ref_buf[frame]->y_stride, cpi->source->y_buffer, src_width,
-            src_height, src_stride, segment_map, segment_map_w);
+            xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
+            cpi->source->y_buffer, src_width, src_height, src_stride,
+            segment_map, segment_map_w);
 
         const int64_t erroradv_threshold =
             calc_erroradv_threshold(ref_frame_error);
 
         const int64_t warp_error = av1_refine_integerized_param(
-            &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+            &tmp_wm_params, tmp_wm_params.wmtype, xd->bd,
             ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
             ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
             cpi->source->y_buffer, src_width, src_height, src_stride,
@@ -467,12 +467,10 @@
   YV12_BUFFER_CONFIG *source = cpi->source;
 
   gm_info->src_buffer = source->y_buffer;
-  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
-    // The source buffer is 16-bit, so we need to convert to 8 bits for the
-    // following code. We cache the result until the source frame is released.
-    gm_info->src_buffer =
-        av1_downconvert_frame(source, cpi->common.seq_params.bit_depth);
-  }
+  // The source buffer is 16-bit, so we need to convert to 8 bits for the
+  // following code. We cache the result until the source frame is released.
+  gm_info->src_buffer =
+      av1_downconvert_frame(source, cpi->common.seq_params.bit_depth);
 
   gm_info->segment_map_w =
       (source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG;
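
Note on the global_motion hunks above: with the low-bitdepth path gone, source and reference frames are always held in 16-bit buffers, so the global motion code (which operates on 8-bit samples) now calls av1_downconvert_frame() unconditionally rather than only for high-bitdepth input. A minimal sketch of what such a per-plane downconversion amounts to, using a hypothetical helper name (the real entry point is av1_downconvert_frame()):

    #include <stdint.h>

    /* Illustrative only: shift 16-bit samples down to the 8-bit range that
     * the global motion estimation code expects. */
    static void downshift_plane(const uint16_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int width,
                                int height, int bit_depth) {
      const int shift = bit_depth - 8;  /* 0 for 8-bit content, 2 for 10-bit */
      for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
          dst[r * dst_stride + c] = (uint8_t)(src[r * src_stride + c] >> shift);
        }
      }
    }
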
diff --git a/av1/encoder/hash_motion.c b/av1/encoder/hash_motion.c
index 402a01b..ea6854b 100644
--- a/av1/encoder/hash_motion.c
+++ b/av1/encoder/hash_motion.c
@@ -22,21 +22,6 @@
 #define kBlockSizeBits 3
 #define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
 
-// TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
-// If yes, fix this function
-static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
-                                                     int stride,
-                                                     uint8_t *p_pixels_in1D) {
-  const uint8_t *p_pel = y_src;
-  int index = 0;
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 2; j++) {
-      p_pixels_in1D[index++] = p_pel[j];
-    }
-    p_pel += stride;
-  }
-}
-
 static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
                                                       int stride,
                                                       uint16_t *p_pixels_in1D) {
@@ -50,13 +35,6 @@
   }
 }
 
-static int is_block_2x2_row_same_value(const uint8_t *p) {
-  if (p[0] != p[1] || p[2] != p[3]) {
-    return 0;
-  }
-  return 1;
-}
-
 static int is_block16_2x2_row_same_value(const uint16_t *p) {
   if (p[0] != p[1] || p[2] != p[3]) {
     return 0;
@@ -64,13 +42,6 @@
   return 1;
 }
 
-static int is_block_2x2_col_same_value(const uint8_t *p) {
-  if ((p[0] != p[2]) || (p[1] != p[3])) {
-    return 0;
-  }
-  return 1;
-}
-
 static int is_block16_2x2_col_same_value(const uint16_t *p) {
   if ((p[0] != p[2]) || (p[1] != p[3])) {
     return 0;
@@ -193,45 +164,24 @@
   CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
 
   const int length = width * 2;
-  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-    uint16_t p[4];
-    int pos = 0;
-    for (int y_pos = 0; y_pos < y_end; y_pos++) {
-      for (int x_pos = 0; x_pos < x_end; x_pos++) {
-        get_pixels_in_1D_short_array_by_block_2x2(
-            CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
-                x_pos,
-            picture->y_stride, p);
-        pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
-        pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
+  uint16_t p[4];
+  int pos = 0;
+  for (int y_pos = 0; y_pos < y_end; y_pos++) {
+    for (int x_pos = 0; x_pos < x_end; x_pos++) {
+      get_pixels_in_1D_short_array_by_block_2x2(
+          CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
+              x_pos,
+          picture->y_stride, p);
+      pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
+      pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
 
-        pic_block_hash[0][pos] =
-            av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0]));
-        pic_block_hash[1][pos] =
-            av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0]));
-        pos++;
-      }
-      pos += width - 1;
+      pic_block_hash[0][pos] =
+          av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0]));
+      pic_block_hash[1][pos] =
+          av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0]));
+      pos++;
     }
-  } else {
-    uint8_t p[4];
-    int pos = 0;
-    for (int y_pos = 0; y_pos < y_end; y_pos++) {
-      for (int x_pos = 0; x_pos < x_end; x_pos++) {
-        get_pixels_in_1D_char_array_by_block_2x2(
-            picture->y_buffer + y_pos * picture->y_stride + x_pos,
-            picture->y_stride, p);
-        pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
-        pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
-
-        pic_block_hash[0][pos] =
-            av1_get_crc_value(calc_1, p, length * sizeof(p[0]));
-        pic_block_hash[1][pos] =
-            av1_get_crc_value(calc_2, p, length * sizeof(p[0]));
-        pos++;
-      }
-      pos += width - 1;
-    }
+    pos += width - 1;
   }
 }
 
@@ -347,25 +297,14 @@
   const int stride = picture->y_stride;
   const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
 
-  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
-    for (int i = 0; i < block_size; i++) {
-      for (int j = 1; j < block_size; j++) {
-        if (p16[j] != p16[0]) {
-          return 0;
-        }
+  const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+  for (int i = 0; i < block_size; i++) {
+    for (int j = 1; j < block_size; j++) {
+      if (p16[j] != p16[0]) {
+        return 0;
       }
-      p16 += stride;
     }
-  } else {
-    for (int i = 0; i < block_size; i++) {
-      for (int j = 1; j < block_size; j++) {
-        if (p[j] != p[0]) {
-          return 0;
-        }
-      }
-      p += stride;
-    }
+    p16 += stride;
   }
 
   return 1;
@@ -376,21 +315,11 @@
   const int stride = picture->y_stride;
   const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
 
-  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
-    for (int i = 0; i < block_size; i++) {
-      for (int j = 1; j < block_size; j++) {
-        if (p16[j * stride + i] != p16[i]) {
-          return 0;
-        }
-      }
-    }
-  } else {
-    for (int i = 0; i < block_size; i++) {
-      for (int j = 1; j < block_size; j++) {
-        if (p[j * stride + i] != p[i]) {
-          return 0;
-        }
+  const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+  for (int i = 0; i < block_size; i++) {
+    for (int j = 1; j < block_size; j++) {
+      if (p16[j * stride + i] != p16[i]) {
+        return 0;
       }
     }
   }
@@ -399,8 +328,7 @@
 
 void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
                               const uint8_t *y_src, int stride, int block_size,
-                              uint32_t *hash_value1, uint32_t *hash_value2,
-                              int use_highbitdepth) {
+                              uint32_t *hash_value1, uint32_t *hash_value2) {
   int add_value = hash_block_size_to_index(block_size);
   assert(add_value >= 0);
   add_value <<= kSrcBits;
@@ -413,34 +341,18 @@
 
   // 2x2 subblock hash values in current CU
   int sub_block_in_width = (block_size >> 1);
-  if (use_highbitdepth) {
-    uint16_t pixel_to_hash[4];
-    uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
-    for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
-      for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
-        int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
-        get_pixels_in_1D_short_array_by_block_2x2(
-            y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
-        assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash,
-                                          sizeof(pixel_to_hash));
-        buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash,
-                                          sizeof(pixel_to_hash));
-      }
-    }
-  } else {
-    uint8_t pixel_to_hash[4];
-    for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
-      for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
-        int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
-        get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
-                                                 stride, pixel_to_hash);
-        assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
-        buf_1[0][pos] =
-            av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash));
-        buf_2[0][pos] =
-            av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash));
-      }
+  uint16_t pixel_to_hash[4];
+  uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+  for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+    for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+      int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+      get_pixels_in_1D_short_array_by_block_2x2(
+          y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+      assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+      buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash,
+                                        sizeof(pixel_to_hash));
+      buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash,
+                                        sizeof(pixel_to_hash));
     }
   }
 
diff --git a/av1/encoder/hash_motion.h b/av1/encoder/hash_motion.h
index 4a56e2c..fbc49d7 100644
--- a/av1/encoder/hash_motion.h
+++ b/av1/encoder/hash_motion.h
@@ -92,8 +92,7 @@
 
 void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
                               const uint8_t *y_src, int stride, int block_size,
-                              uint32_t *hash_value1, uint32_t *hash_value2,
-                              int use_highbitdepth);
+                              uint32_t *hash_value1, uint32_t *hash_value2);
 
 #ifdef __cplusplus
 }  // extern "C"
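
The hash_motion changes above (and most of the encoder hunks that follow) rely on the existing high-bitdepth pointer convention: 16-bit sample buffers travel through uint8_t * parameters as tagged pointers and are unwrapped at the point of use. A small sketch of the round trip, assuming the macros behave as defined in aom_dsp/aom_dsp_common.h:

    #include <assert.h>
    #include "aom_dsp/aom_dsp_common.h"  /* CONVERT_TO_BYTEPTR, CONVERT_TO_SHORTPTR */

    static void tagged_pointer_round_trip(void) {
      uint16_t pixels16[4] = { 0, 1023, 512, 7 };
      /* Wrap the 16-bit buffer so it can cross a uint8_t * interface. */
      uint8_t *tagged = CONVERT_TO_BYTEPTR(pixels16);
      /* Callees recover the real 16-bit pointer before touching samples. */
      uint16_t *unwrapped = CONVERT_TO_SHORTPTR(tagged);
      assert(unwrapped == pixels16);
    }
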
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index ea65dfd..2583578 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -127,24 +127,6 @@
   }
 }
 
-void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                      int *val_count, int *num_colors) {
-  const int max_pix_val = 1 << 8;
-  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
-  for (int r = 0; r < rows; ++r) {
-    for (int c = 0; c < cols; ++c) {
-      const int this_val = src[r * stride + c];
-      assert(this_val < max_pix_val);
-      ++val_count[this_val];
-    }
-  }
-  int n = 0;
-  for (int i = 0; i < max_pix_val; ++i) {
-    if (val_count[i]) ++n;
-  }
-  *num_colors = n;
-}
-
 void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
                              int cols, int bit_depth, int *val_count,
                              int *bin_val_count, int *num_color_bins,
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index c8fb804..a2907d2 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -317,11 +317,6 @@
 
 /*! \brief Return the number of colors in src. Used by palette mode.
  */
-void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
-                      int *val_count, int *num_colors);
-
-/*! \brief See \ref av1_count_colors(), but for highbd.
- */
 void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
                              int cols, int bit_depth, int *val_count,
                              int *val_count_8bit, int *num_color_bins,
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 99dcfd0..0c7a32e 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -121,40 +121,6 @@
 }
 #undef FIX_PREC_BITS
 
-static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows,
-                                    int cols, float *hist) {
-  float total = 0.1f;
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint8_t *above = &src[c - stride];
-      const uint8_t *below = &src[c + stride];
-      const uint8_t *left = &src[c - 1];
-      const uint8_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
-      } else {
-        const int idx = get_hist_bin_idx(dx, dy);
-        assert(idx >= 0 && idx < BINS);
-        hist[idx] += temp;
-      }
-    }
-    src += stride;
-  }
-
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
-}
-
 static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride,
                                         int rows, int cols, float *hist) {
   float total = 0.1f;
@@ -205,11 +171,7 @@
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *src = x->plane[0].src.buf;
   float hist[BINS] = { 0.0f };
-  if (is_cur_buf_hbd(xd)) {
-    generate_hog_hbd(src, src_stride, rows, cols, hist);
-  } else {
-    generate_hog(src, src_stride, rows, cols, hist);
-  }
+  generate_hog_hbd(src, src_stride, rows, cols, hist);
 
   for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
     float this_score = intra_hog_model_bias[i];
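
With the 8-bit generate_hog() removed, gradient histograms for intra mode pruning are always produced by generate_hog_hbd(). For 8-bit content the 16-bit buffer holds the same sample values, so the Sobel responses dx and dy, and therefore hist[] and the pruning decisions, should be unchanged; a flat 8-bit patch, for example, yields dx == dy == 0 on both paths and contributes nothing to the histogram. The same reasoning applies to the av1_count_colors() removal earlier: at bit depth 8, av1_count_colors_highbd() should count the same set of distinct values, with the binned threshold coinciding with the plain color count.
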
diff --git a/av1/encoder/lookahead.c b/av1/encoder/lookahead.c
index 9c79ecb..251feaf 100644
--- a/av1/encoder/lookahead.c
+++ b/av1/encoder/lookahead.c
@@ -45,8 +45,8 @@
 
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
-    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
-    const int border_in_pixels, int byte_alignment, int num_lap_buffers) {
+    unsigned int subsampling_y, unsigned int depth, const int border_in_pixels,
+    int byte_alignment, int num_lap_buffers) {
   struct lookahead_ctx *ctx = NULL;
   int lag_in_frames = AOMMAX(1, depth);
 
@@ -72,10 +72,9 @@
     if (!ctx->buf) goto fail;
     for (i = 0; i < depth; i++) {
       aom_free_frame_buffer(&ctx->buf[i].img);
-      if (aom_realloc_frame_buffer(&ctx->buf[i].img, width, height,
-                                   subsampling_x, subsampling_y,
-                                   use_highbitdepth, border_in_pixels,
-                                   byte_alignment, NULL, NULL, NULL))
+      if (aom_realloc_frame_buffer(
+              &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+              border_in_pixels, byte_alignment, NULL, NULL, NULL))
         goto fail;
     }
   }
@@ -86,7 +85,7 @@
 }
 
 int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       int64_t ts_start, int64_t ts_end,
                        aom_enc_frame_flags_t flags) {
   struct lookahead_entry *buf;
   int width = src->y_crop_width;
@@ -119,8 +118,7 @@
     YV12_BUFFER_CONFIG new_img;
     memset(&new_img, 0, sizeof(new_img));
     if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
-                               subsampling_y, use_highbitdepth,
-                               AOM_BORDER_IN_PIXELS, 0))
+                               subsampling_y, AOM_BORDER_IN_PIXELS, 0))
       return 1;
     aom_free_frame_buffer(&buf->img);
     buf->img = new_img;
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h
index 9506c9b..524bb0d 100644
--- a/av1/encoder/lookahead.h
+++ b/av1/encoder/lookahead.h
@@ -63,8 +63,8 @@
  */
 struct lookahead_ctx *av1_lookahead_init(
     unsigned int width, unsigned int height, unsigned int subsampling_x,
-    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
-    const int border_in_pixels, int byte_alignment, int num_lap_buffers);
+    unsigned int subsampling_y, unsigned int depth, const int border_in_pixels,
+    int byte_alignment, int num_lap_buffers);
 
 /**\brief Destroys the lookahead stage
  */
@@ -82,11 +82,10 @@
  * \param[in] src         Pointer to the image to enqueue
  * \param[in] ts_start    Timestamp for the start of this frame
  * \param[in] ts_end      Timestamp for the end of this frame
- * \param[in] use_highbitdepth Tell if HBD is used
  * \param[in] flags       Flags set on this frame
  */
 int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       int64_t ts_start, int64_t ts_end,
                        aom_enc_frame_flags_t flags);
 
 /**\brief Get the next source buffer to encode
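
For code that drives the lookahead directly, the only visible change is the shorter signatures. A hypothetical caller (variable names are illustrative, not taken from this patch) would now look like:

    struct lookahead_ctx *ctx = av1_lookahead_init(
        width, height, subsampling_x, subsampling_y,
        /*depth=*/lag_in_frames, AOM_BORDER_IN_PIXELS,
        /*byte_alignment=*/0, /*num_lap_buffers=*/0);
    if (ctx != NULL &&
        av1_lookahead_push(ctx, src, ts_start, ts_end, /*flags=*/0)) {
      /* Push failed, e.g. the internal frame buffer could not be resized. */
    }
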
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 140b115..cfdc907 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2203,7 +2203,7 @@
   hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table;
 
   av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width,
-                           &hash_value1, &hash_value2, is_cur_buf_hbd(xd));
+                           &hash_value1, &hash_value2);
 
   const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
   if (count <= 1) {
@@ -2759,48 +2759,26 @@
 
   unsigned int besterr;
 
-  if (is_cur_buf_hbd(xd)) {
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
-    if (second_pred != NULL) {
-      if (mask) {
-        aom_highbd_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
-            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
-            invert_mask, xd->bd, subpel_search_type);
-      } else {
-        aom_highbd_comp_avg_upsampled_pred(
-            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
-            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
-            subpel_search_type);
-      }
+  DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+  uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+  if (second_pred != NULL) {
+    if (mask) {
+      aom_highbd_comp_mask_upsampled_pred(
+          xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+          subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+          invert_mask, xd->bd, subpel_search_type);
     } else {
-      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
-                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                                xd->bd, subpel_search_type);
+      aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8,
+                                         second_pred, w, h, subpel_x_q3,
+                                         subpel_y_q3, ref, ref_stride, xd->bd,
+                                         subpel_search_type);
     }
-    besterr = vfp->vf(pred8, w, src, src_stride, sse);
   } else {
-    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
-    if (second_pred != NULL) {
-      if (mask) {
-        aom_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
-            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
-            invert_mask, subpel_search_type);
-      } else {
-        aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
-                                    second_pred, w, h, subpel_x_q3, subpel_y_q3,
-                                    ref, ref_stride, subpel_search_type);
-      }
-    } else {
-      aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
-                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                         subpel_search_type);
-    }
-
-    besterr = vfp->vf(pred, w, src, src_stride, sse);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                              subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+                              subpel_search_type);
   }
+  besterr = vfp->vf(pred8, w, src, src_stride, sse);
 
   return besterr;
 }
@@ -3097,6 +3075,7 @@
     const MACROBLOCKD *xd, const MV *bestmv,
     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
     const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  (void)xd;
   const aom_variance_fn_ptr_t *vfp = var_params->vfp;
   const int w = var_params->w;
   const int h = var_params->h;
@@ -3114,26 +3093,15 @@
   unsigned int besterr;
 
   if (second_pred != NULL) {
-    if (is_cur_buf_hbd(xd)) {
-      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-      uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
-      if (mask) {
-        aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
-                                  mask, mask_stride, invert_mask);
-      } else {
-        aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
-      }
-      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+    uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+    if (mask) {
+      aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+                                mask_stride, invert_mask);
     } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-      if (mask) {
-        aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
-                           mask_stride, invert_mask);
-      } else {
-        aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
-      }
-      besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+      aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
     }
+    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
     besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
   }
@@ -4168,18 +4136,11 @@
 
   unsigned int besterr;
   DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
-  if (is_cur_buf_hbd(xd)) {
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
-    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
-                              subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
-                              subpel_search_type);
-    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
-  } else {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
-                       subpel_y_q3, ref, ref_stride, subpel_search_type);
-
-    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
-  }
+  uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+                            subpel_search_type);
+  besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
 
   return besterr;
 }
diff --git a/av1/encoder/mips/msa/error_msa.c b/av1/encoder/mips/msa/error_msa.c
deleted file mode 100644
index 4e99b3d..0000000
--- a/av1/encoder/mips/msa/error_msa.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                     \
-  static int64_t block_error_##BSize##size_msa(                              \
-      const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
-    int64_t err = 0;                                                         \
-    uint32_t loop_cnt;                                                       \
-    v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h;                             \
-    v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w;                              \
-    v2i64 sq_coeff_r, sq_coeff_l;                                            \
-    v2i64 err0, err_dup0, err1, err_dup1;                                    \
-                                                                             \
-    coeff = LD_SH(coeff_ptr);                                                \
-    dq_coeff = LD_SH(dq_coeff_ptr);                                          \
-    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
-    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
-    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
-    DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r,      \
-                sq_coeff_l);                                                 \
-    DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1);                 \
-                                                                             \
-    coeff = LD_SH(coeff_ptr + 8);                                            \
-    dq_coeff = LD_SH(dq_coeff_ptr + 8);                                      \
-    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
-    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
-    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
-    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);              \
-    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                                \
-                                                                             \
-    coeff_ptr += 16;                                                         \
-    dq_coeff_ptr += 16;                                                      \
-                                                                             \
-    for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) {                       \
-      coeff = LD_SH(coeff_ptr);                                              \
-      dq_coeff = LD_SH(dq_coeff_ptr);                                        \
-      UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
-      ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
-      HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
-      DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
-      DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
-                                                                             \
-      coeff = LD_SH(coeff_ptr + 8);                                          \
-      dq_coeff = LD_SH(dq_coeff_ptr + 8);                                    \
-      UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
-      ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
-      HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
-      DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
-      DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
-                                                                             \
-      coeff_ptr += 16;                                                       \
-      dq_coeff_ptr += 16;                                                    \
-    }                                                                        \
-                                                                             \
-    err_dup0 = __msa_splati_d(sq_coeff_r, 1);                                \
-    err_dup1 = __msa_splati_d(sq_coeff_l, 1);                                \
-    sq_coeff_r += err_dup0;                                                  \
-    sq_coeff_l += err_dup1;                                                  \
-    *ssz = __msa_copy_s_d(sq_coeff_r, 0);                                    \
-    *ssz += __msa_copy_s_d(sq_coeff_l, 0);                                   \
-                                                                             \
-    err_dup0 = __msa_splati_d(err0, 1);                                      \
-    err_dup1 = __msa_splati_d(err1, 1);                                      \
-    err0 += err_dup0;                                                        \
-    err1 += err_dup1;                                                        \
-    err = __msa_copy_s_d(err0, 0);                                           \
-    err += __msa_copy_s_d(err1, 0);                                          \
-                                                                             \
-    return err;                                                              \
-  }
-
-/* clang-format off */
-BLOCK_ERROR_BLOCKSIZE_MSA(16)
-BLOCK_ERROR_BLOCKSIZE_MSA(64)
-BLOCK_ERROR_BLOCKSIZE_MSA(256)
-BLOCK_ERROR_BLOCKSIZE_MSA(1024)
-/* clang-format on */
-
-int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
-                            const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
-                            int64_t *ssz) {
-  int64_t err;
-  const int16_t *coeff = (const int16_t *)coeff_ptr;
-  const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
-
-  switch (blk_size) {
-    case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
-    case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
-    case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
-    case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
-    default:
-      err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
-      break;
-  }
-
-  return err;
-}
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index 4febcfe..b420db3 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -56,13 +56,8 @@
                              const int bh) {
   int64_t sse = 0;
   const int shift = xd->bd - 8;
-  if (is_cur_buf_hbd(xd)) {
-    sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                         bw, bh);
-  } else {
-    sse =
-        aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
-  }
+  sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                       bw, bh);
   sse = ROUND_POWER_OF_TWO(sse, shift * 2);
   return sse;
 }
@@ -90,7 +85,7 @@
   (void)num_samples;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int dequant_shift = xd->bd - 5;
 
   // Fast approximate the modelling function.
   if (cpi->sf.rd_sf.simple_model_rd_from_var) {
@@ -124,7 +119,7 @@
   (void)plane_bsize;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int dequant_shift = xd->bd - 5;
   const int qstep = AOMMAX(
       ROUND_POWER_OF_TWO(p->dequant_QTX[1], QUANT_TABLE_BITS) >> dequant_shift,
       1);
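
Sanity check on the dequant_shift simplification above: 8-bit content is now always encoded with xd->bd == 8, so

    dequant_shift = xd->bd - 5 = 8 - 5 = 3

which is exactly the constant the removed non-HBD branch used; 10- and 12-bit content keep their previous values of 5 and 7.
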
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 790365d..083d7ac 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -371,7 +371,7 @@
 
   // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
-  uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
+  uint8_t *second_pred = CONVERT_TO_BYTEPTR(second_pred16);
   int_mv best_mv;
 
   // Allow joint search multiple times iteratively for each reference frame
@@ -424,8 +424,8 @@
     InterPredParams inter_pred_params;
     const InterpFilter interp_filters = EIGHTTAP_REGULAR;
     av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE,
-                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          &cm->sf_identity, &ref_yv12[!id], interp_filters);
+                          mi_col * MI_SIZE, 0, 0, xd->bd, 0, &cm->sf_identity,
+                          &ref_yv12[!id], interp_filters);
     inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
 
     // Since we have scaled the reference frames to match the size of the
@@ -604,8 +604,8 @@
     other_mv->col = ref_other_mv.as_mv.col;
     struct buf_2d ref_yv12 = xd->plane[0].pre[!ref_idx];
     av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE,
-                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          &cm->sf_identity, &ref_yv12, mbmi->interp_fltr);
+                          mi_col * MI_SIZE, 0, 0, xd->bd, 0, &cm->sf_identity,
+                          &ref_yv12, mbmi->interp_fltr);
     inter_pred_params.conv_params = get_conv_params(0, PLANE_TYPE_Y, xd->bd);
   }
 #endif  // CONFIG_JOINT_MVD
@@ -790,9 +790,8 @@
   InterPredParams inter_pred_params;
 
   av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col,
-                        pd->subsampling_x, pd->subsampling_y, xd->bd,
-                        is_cur_buf_hbd(xd), 0, &sf, &ref_yv12,
-                        mbmi->interp_fltr);
+                        pd->subsampling_x, pd->subsampling_y, xd->bd, 0, &sf,
+                        &ref_yv12, mbmi->interp_fltr);
   inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
 
   // Get the prediction block from the 'other' reference frame.
@@ -806,16 +805,14 @@
     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
     const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
   MACROBLOCKD *xd = &x->e_mbd;
+  (void)xd;
   // This function should only ever be called for compound modes
   assert(has_second_ref(xd->mi[0]));
 
   // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
   uint8_t *second_pred;
-  if (is_cur_buf_hbd(xd))
-    second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
-  else
-    second_pred = (uint8_t *)second_pred_alloc_16;
+  second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
 
   MV *this_mv = &cur_mv[ref_idx].as_mv;
 #if CONFIG_JOINT_MVD
diff --git a/av1/encoder/mv_prec.c b/av1/encoder/mv_prec.c
index 8d476e8..dac68cc 100644
--- a/av1/encoder/mv_prec.c
+++ b/av1/encoder/mv_prec.c
@@ -307,35 +307,19 @@
   const int num_cols = block_size_wide[bsize];
   const int y_stride = cpi->source->y_stride;
   const int px_row = 4 * mi_row, px_col = 4 * mi_col;
-  const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
   const int bd = cm->seq_params.bit_depth;
-  if (buf_is_hbd) {
-    uint16_t *source_buf =
-        CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
-    for (int row = 0; row < num_rows - 1; row++) {
-      for (int col = 0; col < num_cols - 1; col++) {
-        const int offset = row * y_stride + col;
-        const int horz_diff =
-            abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8);
-        const int vert_diff =
-            abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8);
-        mv_stats->horz_text += horz_diff;
-        mv_stats->vert_text += vert_diff;
-        mv_stats->diag_text += horz_diff * vert_diff;
-      }
-    }
-  } else {
-    uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col;
-    for (int row = 0; row < num_rows - 1; row++) {
-      for (int col = 0; col < num_cols - 1; col++) {
-        const int offset = row * y_stride + col;
-        const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]);
-        const int vert_diff =
-            abs(source_buf[offset + y_stride] - source_buf[offset]);
-        mv_stats->horz_text += horz_diff;
-        mv_stats->vert_text += vert_diff;
-        mv_stats->diag_text += horz_diff * vert_diff;
-      }
+  uint16_t *source_buf =
+      CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
+  for (int row = 0; row < num_rows - 1; row++) {
+    for (int col = 0; col < num_cols - 1; col++) {
+      const int offset = row * y_stride + col;
+      const int horz_diff =
+          abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8);
+      const int vert_diff =
+          abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8);
+      mv_stats->horz_text += horz_diff;
+      mv_stats->vert_text += vert_diff;
+      mv_stats->diag_text += horz_diff * vert_diff;
     }
   }
 }
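
The same argument applies to the texture statistics above: an 8-bit source now sits in a 16-bit buffer but still has bd == 8, so the normalization shift bd - 8 is 0 and

    horz_diff = abs(source_buf[offset + 1] - source_buf[offset]) >> 0

matches the removed 8-bit loop bit for bit, while 10-bit input keeps its shift of 2 and stays on the same 8-bit scale as before.
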
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index fd67d95..6b1119f 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -242,15 +242,9 @@
     return;
   }
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.seq_params.use_highbitdepth) {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel_highbd(
-          (int)centroids[i], cpi->common.seq_params.bit_depth);
-    }
-  } else {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel(centroids[i]);
-    }
+  for (int i = 0; i < num_unique_colors; ++i) {
+    pmi->palette_colors[i] =
+        clip_pixel_highbd((int)centroids[i], cpi->common.seq_params.bit_depth);
   }
   pmi->palette_size[0] = num_unique_colors;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -447,54 +441,32 @@
   av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
                            &cols);
   const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  const int is_hbd = seq_params->use_highbitdepth;
   const int bit_depth = seq_params->bit_depth;
   int unused;
 
   int count_buf[1 << 12];      // Maximum (1 << 12) color levels.
   int count_buf_8bit[1 << 8];  // Maximum (1 << 8) bins for hbd path.
   int colors, colors_threshold = 0;
-  if (is_hbd) {
-    av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
-                            count_buf_8bit, &colors_threshold, &colors);
-  } else {
-    av1_count_colors(src, src_stride, rows, cols, count_buf, &colors);
-    colors_threshold = colors;
-  }
+  av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
+                          count_buf_8bit, &colors_threshold, &colors);
 
   uint8_t *const color_map = xd->plane[0].color_index_map;
   if (colors_threshold > 1 && colors_threshold <= 64) {
     int *const data = x->palette_buffer->kmeans_data_buf;
     int centroids[PALETTE_MAX_SIZE];
     int lb, ub;
-    if (is_hbd) {
-      int *data_pt = data;
-      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
-      lb = ub = src_pt[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
+    int *data_pt = data;
+    const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
+    lb = ub = src_pt[0];
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        const int val = src_pt[c];
+        data_pt[c] = val;
+        lb = AOMMIN(lb, val);
+        ub = AOMMAX(ub, val);
       }
-    } else {
-      int *data_pt = data;
-      const uint8_t *src_pt = src;
-      lb = ub = src[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
+      src_pt += src_stride;
+      data_pt += cols;
     }
 
     mbmi->mode = DC_PRED;
@@ -701,19 +673,12 @@
 #endif                         // CONFIG_AIMC
   int count_buf[1 << 12];      // Maximum (1 << 12) color levels.
   int count_buf_8bit[1 << 8];  // Maximum (1 << 8) bins for hbd path.
-  if (seq_params->use_highbitdepth) {
-    av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                            seq_params->bit_depth, count_buf, count_buf_8bit,
-                            &colors_threshold_u, &colors_u);
-    av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                            seq_params->bit_depth, count_buf, count_buf_8bit,
-                            &colors_threshold_v, &colors_v);
-  } else {
-    av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u);
-    av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v);
-    colors_threshold_u = colors_u;
-    colors_threshold_v = colors_v;
-  }
+  av1_count_colors_highbd(src_u, src_stride, rows, cols, seq_params->bit_depth,
+                          count_buf, count_buf_8bit, &colors_threshold_u,
+                          &colors_u);
+  av1_count_colors_highbd(src_v, src_stride, rows, cols, seq_params->bit_depth,
+                          count_buf, count_buf_8bit, &colors_threshold_v,
+                          &colors_v);
 
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
   const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
@@ -732,31 +697,17 @@
 
     uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
     uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
-    if (seq_params->use_highbitdepth) {
-      lb_u = src_u16[0];
-      ub_u = src_u16[0];
-      lb_v = src_v16[0];
-      ub_v = src_v16[0];
-    } else {
-      lb_u = src_u[0];
-      ub_u = src_u[0];
-      lb_v = src_v[0];
-      ub_v = src_v[0];
-    }
+    lb_u = src_u16[0];
+    ub_u = src_u16[0];
+    lb_v = src_v16[0];
+    ub_v = src_v16[0];
 
     for (r = 0; r < rows; ++r) {
       for (c = 0; c < cols; ++c) {
-        if (seq_params->use_highbitdepth) {
-          val_u = src_u16[r * src_stride + c];
-          val_v = src_v16[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        } else {
-          val_u = src_u[r * src_stride + c];
-          val_v = src_v[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        }
+        val_u = src_u16[r * src_stride + c];
+        val_v = src_v16[r * src_stride + c];
+        data[(r * cols + c) * 2] = val_u;
+        data[(r * cols + c) * 2 + 1] = val_v;
         if (val_u < lb_u)
           lb_u = val_u;
         else if (val_u > ub_u)
@@ -796,12 +747,8 @@
       pmi->palette_size[1] = n;
       for (i = 1; i < 3; ++i) {
         for (j = 0; j < n; ++j) {
-          if (seq_params->use_highbitdepth)
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
-                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
-          else
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel((int)centroids[j * 2 + i - 1]);
+          pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+              (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
         }
       }
 
@@ -846,18 +793,14 @@
   const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
   const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
   int plane_block_width, plane_block_height, rows, cols;
+  (void)cpi;
   av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
                            &plane_block_height, &rows, &cols);
 
   for (r = 0; r < rows; ++r) {
     for (c = 0; c < cols; ++c) {
-      if (cpi->common.seq_params.use_highbitdepth) {
-        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
-      } else {
-        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
-      }
+      data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+      data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
     }
   }
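
The palette hunks above lean on clip_pixel_highbd() degenerating to the plain 8-bit clamp when the sequence bit depth is 8, so dropping the clip_pixel() branches does not change 8-bit palette colors (assuming the usual aom_dsp_common.h definitions of these helpers):

    clip_pixel_highbd(v, 8)  == clamp(v, 0, 255)    /* same as the removed clip_pixel(v) */
    clip_pixel_highbd(v, 10) == clamp(v, 0, 1023)
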
 
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 94b8eb3..5cdd990 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -415,8 +415,7 @@
           continue;
         mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
                                   cm->current_frame.order_hint, plane, pixel_c,
-                                  pixel_r, pd->width, pd->height,
-                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+                                  pixel_r, pd->width, pd->height);
       }
     }
 #else
@@ -763,13 +762,8 @@
   // Reset skip mode flag.
   mbmi->skip_mode = 0;
 
-  if (is_cur_buf_hbd(xd)) {
-    x->source_variance = av1_high_get_sby_perpixel_variance(
-        cpi, &x->plane[0].src, bsize, xd->bd);
-  } else {
-    x->source_variance =
-        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-  }
+  x->source_variance =
+      av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
 
   // Initialize default mode evaluation params
   set_mode_eval_params(cpi, x, DEFAULT_EVAL);
@@ -3366,13 +3360,8 @@
 
   if (pb_source_variance == UINT_MAX) {
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-    if (is_cur_buf_hbd(xd)) {
-      pb_source_variance = av1_high_get_sby_perpixel_variance(
-          cpi, &x->plane[0].src, bsize, xd->bd);
-    } else {
-      pb_source_variance =
-          av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-    }
+    pb_source_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, bsize, xd->bd);
   }
 
   assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 4fd8701..964fb6f 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -115,20 +115,12 @@
     const int width = 65, height = 65,
               stride = x->plane[AOM_PLANE_Y].src.stride;
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *image[1] = {
-        CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
-      };
+    uint16_t *image[1] = { CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) -
+                           stride - 1 };
 
-      av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
-                                           cnn_config, &thread_data, bit_depth,
-                                           &output);
-    } else {
-      uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
-
-      av1_cnn_predict_img_multi_out(image, width, height, stride, cnn_config,
-                                    &thread_data, &output);
-    }
+    av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+                                         cnn_config, &thread_data, bit_depth,
+                                         &output);
 
     part_info->cnn_output_valid = 1;
   }
@@ -782,11 +774,8 @@
     const MACROBLOCKD *const xd = &x->e_mbd;
     // TODO(debargha): x->source_variance is unavailable at this point,
     // so compute. The redundant recomputation later can be removed.
-    const unsigned int source_variance =
-        is_cur_buf_hbd(xd)
-            ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size,
-                                                 xd->bd)
-            : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size);
+    const unsigned int source_variance = av1_high_get_sby_perpixel_variance(
+        cpi, &x->plane[0].src, sb_size, xd->bd);
     if (source_variance > 16) {
       const double thresh = source_variance < 128 ? 0.05 : 0.1;
       for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
@@ -985,13 +974,8 @@
   // Variance ratios
   const MACROBLOCKD *const xd = &x->e_mbd;
   int whole_block_variance;
-  if (is_cur_buf_hbd(xd)) {
-    whole_block_variance = av1_high_get_sby_perpixel_variance(
-        cpi, &x->plane[0].src, bsize, xd->bd);
-  } else {
-    whole_block_variance =
-        av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
-  }
+  whole_block_variance =
+      av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
   whole_block_variance = AOMMAX(whole_block_variance, 1);
 
   int split_variance[SUB_PARTITIONS_SPLIT];
@@ -1003,12 +987,8 @@
     const int x_idx = (i & 1) * bw / 2;
     const int y_idx = (i >> 1) * bw / 2;
     buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
-    if (is_cur_buf_hbd(xd)) {
-      split_variance[i] =
-          av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
-    } else {
-      split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
-    }
+    split_variance[i] =
+        av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
   }
 
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++)
@@ -1189,17 +1169,10 @@
       horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
       vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
 
-      if (is_cur_buf_hbd(xd)) {
-        horz_4_source_var[i] = av1_high_get_sby_perpixel_variance(
-            cpi, &horz_4_src, horz_4_bs, xd->bd);
-        vert_4_source_var[i] = av1_high_get_sby_perpixel_variance(
-            cpi, &vert_4_src, vert_4_bs, xd->bd);
-      } else {
-        horz_4_source_var[i] =
-            av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs);
-        vert_4_source_var[i] =
-            av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs);
-      }
+      horz_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+          cpi, &horz_4_src, horz_4_bs, xd->bd);
+      vert_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+          cpi, &vert_4_src, vert_4_bs, xd->bd);
     }
   }
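
In partition_search.c and partition_strategy.c every source-variance computation now goes through av1_high_get_sby_perpixel_variance(..., xd->bd). For 8-bit content this evaluates the same sums over the same sample values as the removed av1_get_sby_perpixel_variance(), only via the 16-bit load path, so the partition decisions driven by these variances should be unchanged; a constant block, for instance, still reports a per-pixel variance of 0 on either path.
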
 
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 63c623f..80d8999 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -237,17 +237,6 @@
     memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
 }
 
-static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
-                         int src_voffset, int src_hoffset, int sstride,
-                         int vsize, int hsize) {
-  int r, c;
-  const uint8_t *src8 = (uint8_t *)src;
-  const uint8_t *base = &src8[src_voffset * sstride + src_hoffset];
-  for (r = 0; r < vsize; r++)
-    for (c = 0; c < hsize; c++)
-      dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
-}
-
 static INLINE void init_src_params(int *src_stride, int *width, int *height,
                                    int *width_log2, int *height_log2,
                                    BLOCK_SIZE bsize) {
@@ -282,29 +271,6 @@
   return sum >> 2 * coeff_shift;
 }
 
-static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
-                                  cdef_list *dlist, int cdef_count,
-                                  BLOCK_SIZE bsize, int coeff_shift, int row,
-                                  int col) {
-  assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
-         bsize == BLOCK_8X8);
-  uint64_t sum = 0;
-  int bi, bx, by;
-  uint8_t *dst8 = (uint8_t *)dst;
-  uint8_t *dst_buff = &dst8[row * dstride + col];
-  int src_stride, width, height, width_log2, height_log2;
-  init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
-                  bsize);
-  for (bi = 0; bi < cdef_count; bi++) {
-    by = dlist[bi].by;
-    bx = dlist[bi].bx;
-    sum += aom_mse_wxh_16bit(
-        &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
-        &src[bi << (height_log2 + width_log2)], src_stride, width, height);
-  }
-  return sum >> 2 * coeff_shift;
-}
-
 static int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row,
                        int mi_col) {
   const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
@@ -441,13 +407,8 @@
   copy_fn_t copy_fn;
   compute_cdef_dist_t compute_cdef_dist_fn;
 
-  if (cm->seq_params.use_highbitdepth) {
-    copy_fn = copy_sb16_16_highbd;
-    compute_cdef_dist_fn = compute_cdef_dist_highbd;
-  } else {
-    copy_fn = copy_sb16_16;
-    compute_cdef_dist_fn = compute_cdef_dist;
-  }
+  copy_fn = copy_sb16_16_highbd;
+  compute_cdef_dist_fn = compute_cdef_dist_highbd;
 
   DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
   uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
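
With every frame stored as 16-bit, the 8-bit copy_sb16_16()/compute_cdef_dist() pair removed above was dead code: copy_sb16_16_highbd() already fills the CDEF working buffer in[] straight from the 16-bit plane and compute_cdef_dist_highbd() measures distortion against the 16-bit destination, so for 8-bit content (values 0..255 held in uint16_t) the filter-strength search evaluates the same data as before.
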
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index 26bd946..0732a9a 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -12,6 +12,7 @@
 #ifndef AOM_AV1_ENCODER_PICKCDEF_H_
 #define AOM_AV1_ENCODER_PICKCDEF_H_
 
+#include "av1/common/av1_common_int.h"
 #include "av1/common/cdef.h"
 #include "av1/encoder/speed_features.h"
 
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index 4098606..ed6c168 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -126,8 +126,7 @@
 #endif
                           plane, plane + 1, partial_frame);
 
-  filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
-                               cm->seq_params.use_highbitdepth);
+  filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane);
 
   // Re-instate the unfiltered frame
   yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index ae1bfd4..86fad19 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -68,31 +68,30 @@
                                             int hstart, int width, int vstart,
                                             int height);
 
-#define NUM_EXTRACTORS (3 * (1 + 1))
+#define NUM_EXTRACTORS 3
 
 static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
-  aom_get_y_sse_part,        aom_get_u_sse_part,
-  aom_get_v_sse_part,        aom_highbd_get_y_sse_part,
-  aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part,
+  aom_highbd_get_y_sse_part,
+  aom_highbd_get_u_sse_part,
+  aom_highbd_get_v_sse_part,
 };
 static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
-  aom_get_y_var,        aom_get_u_var,        aom_get_v_var,
-  aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var,
+  aom_highbd_get_y_var,
+  aom_highbd_get_u_var,
+  aom_highbd_get_v_var,
 };
 
 static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
                                     const YV12_BUFFER_CONFIG *src,
-                                    const YV12_BUFFER_CONFIG *dst, int plane,
-                                    int highbd) {
-  return sse_part_extractors[3 * highbd + plane](
+                                    const YV12_BUFFER_CONFIG *dst, int plane) {
+  return sse_part_extractors[plane](
       src, dst, limits->h_start, limits->h_end - limits->h_start,
       limits->v_start, limits->v_end - limits->v_start);
 }
 
 static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
-                                     const YV12_BUFFER_CONFIG *src, int plane,
-                                     int highbd) {
-  return var_part_extractors[3 * highbd + plane](
+                                     const YV12_BUFFER_CONFIG *src, int plane) {
+  return var_part_extractors[plane](
       src, limits->h_start, limits->h_end - limits->h_start, limits->v_start,
       limits->v_end - limits->v_start);
 }
@@ -193,7 +192,6 @@
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationLineBuffers rlbs;
   const int bit_depth = cm->seq_params.bit_depth;
-  const int highbd = cm->seq_params.use_highbitdepth;
 
   const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
   // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
@@ -203,82 +201,11 @@
   av1_loop_restoration_filter_unit(
       limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
       is_uv && cm->seq_params.subsampling_x,
-      is_uv && cm->seq_params.subsampling_y, highbd, bit_depth,
-      fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
-      rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
+      is_uv && cm->seq_params.subsampling_y, bit_depth, fts->buffers[plane],
+      fts->strides[is_uv], rsc->dst->buffers[plane], rsc->dst->strides[is_uv],
+      cm->rst_tmpbuf, optimized_lr);
 
-  return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
-}
-
-int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
-                                     int src_stride, const uint8_t *dat8,
-                                     int dat_stride, int32_t *flt0,
-                                     int flt0_stride, int32_t *flt1,
-                                     int flt1_stride, int xq[2],
-                                     const sgr_params_type *params) {
-  int i, j;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  int64_t err = 0;
-  if (params->r[0] > 0 && params->r[1] > 0) {
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
-        assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
-        const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
-        int32_t v = u << SGRPROJ_PRJ_BITS;
-        v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
-        const int32_t e =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt0 += flt0_stride;
-      flt1 += flt1_stride;
-    }
-  } else if (params->r[0] > 0) {
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
-        const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
-        int32_t v = u << SGRPROJ_PRJ_BITS;
-        v += xq[0] * (flt0[j] - u);
-        const int32_t e =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt0 += flt0_stride;
-    }
-  } else if (params->r[1] > 0) {
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
-        const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
-        int32_t v = u << SGRPROJ_PRJ_BITS;
-        v += xq[1] * (flt1[j] - u);
-        const int32_t e =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt1 += flt1_stride;
-    }
-  } else {
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j < width; ++j) {
-        const int32_t e = (int32_t)(dat[j]) - src[j];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-    }
-  }
-
-  return err;
+  return sse_restoration_unit(limits, rsc->src, rsc->dst, plane);
 }
 
 int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
@@ -357,34 +284,27 @@
 
 static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
                                     int src_stride, const uint8_t *dat8,
-                                    int dat_stride, int use_highbitdepth,
-                                    int32_t *flt0, int flt0_stride,
-                                    int32_t *flt1, int flt1_stride, int *xqd,
+                                    int dat_stride, int32_t *flt0,
+                                    int flt0_stride, int32_t *flt1,
+                                    int flt1_stride, int *xqd,
                                     const sgr_params_type *params) {
   int xq[2];
   av1_decode_xq(xqd, xq, params);
 
-  if (use_highbitdepth) {
-    return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                       dat_stride, flt0, flt0_stride, flt1,
-                                       flt1_stride, xq, params);
-
-  } else {
-    return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                      dat_stride, flt0, flt0_stride, flt1,
-                                      flt1_stride, xq, params);
-  }
+  return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, xq, params);
 }
 
 #define USE_SGRPROJ_REFINEMENT_SEARCH 1
 static int64_t finer_search_pixel_proj_error(
     const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
-    int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int start_step, int *xqd,
     const sgr_params_type *params) {
-  int64_t err = get_pixel_proj_error(
-      src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
-      flt0_stride, flt1, flt1_stride, xqd, params);
+  int64_t err =
+      get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride,
+                           flt0, flt0_stride, flt1, flt1_stride, xqd, params);
   (void)start_step;
 #if USE_SGRPROJ_REFINEMENT_SEARCH
   int64_t err2;
@@ -399,10 +319,9 @@
       do {
         if (xqd[p] - s >= tap_min[p]) {
           xqd[p] -= s;
-          err2 =
-              get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                   dat_stride, use_highbitdepth, flt0,
-                                   flt0_stride, flt1, flt1_stride, xqd, params);
+          err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                      dat_stride, flt0, flt0_stride, flt1,
+                                      flt1_stride, xqd, params);
           if (err2 > err) {
             xqd[p] += s;
           } else {
@@ -418,10 +337,9 @@
       do {
         if (xqd[p] + s <= tap_max[p]) {
           xqd[p] += s;
-          err2 =
-              get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                   dat_stride, use_highbitdepth, flt0,
-                                   flt0_stride, flt1, flt1_stride, xqd, params);
+          err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                      dat_stride, flt0, flt0_stride, flt1,
+                                      flt1_stride, xqd, params);
           if (err2 > err) {
             xqd[p] -= s;
           } else {
@@ -445,35 +363,6 @@
     return (dividend + divisor / 2) / divisor;
 }
 
-static AOM_INLINE void calc_proj_params_r0_r1_c(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
-  const int size = width * height;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-      const int32_t s =
-          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-      const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
-      const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
-      H[0][0] += (int64_t)f1 * f1;
-      H[1][1] += (int64_t)f2 * f2;
-      H[0][1] += (int64_t)f1 * f2;
-      C[0] += (int64_t)f1 * s;
-      C[1] += (int64_t)f2 * s;
-    }
-  }
-  H[0][0] /= size;
-  H[0][1] /= size;
-  H[1][1] /= size;
-  H[1][0] = H[0][1];
-  C[0] /= size;
-  C[1] /= size;
-}
-
 static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
@@ -503,29 +392,6 @@
   C[1] /= size;
 }
 
-static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width,
-                                             int height, int src_stride,
-                                             const uint8_t *dat8,
-                                             int dat_stride, int32_t *flt0,
-                                             int flt0_stride, int64_t H[2][2],
-                                             int64_t C[2]) {
-  const int size = width * height;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-      const int32_t s =
-          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-      const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
-      H[0][0] += (int64_t)f1 * f1;
-      C[0] += (int64_t)f1 * s;
-    }
-  }
-  H[0][0] /= size;
-  C[0] /= size;
-}
-
 static AOM_INLINE void calc_proj_params_r0_high_bd_c(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
@@ -547,29 +413,6 @@
   C[0] /= size;
 }
 
-static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width,
-                                             int height, int src_stride,
-                                             const uint8_t *dat8,
-                                             int dat_stride, int32_t *flt1,
-                                             int flt1_stride, int64_t H[2][2],
-                                             int64_t C[2]) {
-  const int size = width * height;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
-      const int32_t s =
-          (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
-      const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
-      H[1][1] += (int64_t)f2 * f2;
-      C[1] += (int64_t)f2 * s;
-    }
-  }
-  H[1][1] /= size;
-  C[1] /= size;
-}
-
 static AOM_INLINE void calc_proj_params_r1_high_bd_c(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
@@ -598,23 +441,6 @@
 // non-zero and need to be computed.
 // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
 // non-zero and need to be computed.
-void av1_calc_proj_params_c(const uint8_t *src8, int width, int height,
-                            int src_stride, const uint8_t *dat8, int dat_stride,
-                            int32_t *flt0, int flt0_stride, int32_t *flt1,
-                            int flt1_stride, int64_t H[2][2], int64_t C[2],
-                            const sgr_params_type *params) {
-  if ((params->r[0] > 0) && (params->r[1] > 0)) {
-    calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride,
-                             flt0, flt0_stride, flt1, flt1_stride, H, C);
-  } else if (params->r[0] > 0) {
-    calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride,
-                          flt0, flt0_stride, H, C);
-  } else if (params->r[1] > 0) {
-    calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride,
-                          flt1, flt1_stride, H, C);
-  }
-}
-
 static AOM_INLINE void av1_calc_proj_params_high_bd_c(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
@@ -633,13 +459,10 @@
   }
 }
 
-static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width,
-                                         int height, int src_stride,
-                                         const uint8_t *dat8, int dat_stride,
-                                         int use_highbitdepth, int32_t *flt0,
-                                         int flt0_stride, int32_t *flt1,
-                                         int flt1_stride, int *xq,
-                                         const sgr_params_type *params) {
+static AOM_INLINE void get_proj_subspace(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int *xq, const sgr_params_type *params) {
   int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
   int64_t C[2] = { 0, 0 };
 
@@ -647,20 +470,9 @@
   xq[0] = 0;
   xq[1] = 0;
 
-  if (!use_highbitdepth) {
-    if ((width & 0x7) == 0) {
-      av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride,
-                           flt0, flt0_stride, flt1, flt1_stride, H, C, params);
-    } else {
-      av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride,
-                             flt0, flt0_stride, flt1, flt1_stride, H, C,
-                             params);
-    }
-  } else {
-    av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
-                                   dat_stride, flt0, flt0_stride, flt1,
-                                   flt1_stride, H, C, params);
-  }
+  av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+                                 dat_stride, flt0, flt0_stride, flt1,
+                                 flt1_stride, H, C, params);
 
   if (params->r[0] == 0) {
     // H matrix is now only the scalar H[1][1]
@@ -717,9 +529,8 @@
 // Apply the self-guided filter across an entire restoration unit.
 static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
                                  int width, int height, int dat_stride,
-                                 int use_highbd, int bit_depth, int pu_width,
-                                 int pu_height, int32_t *flt0, int32_t *flt1,
-                                 int flt_stride) {
+                                 int bit_depth, int pu_width, int pu_height,
+                                 int32_t *flt0, int32_t *flt1, int flt_stride) {
   for (int i = 0; i < height; i += pu_height) {
     const int h = AOMMIN(pu_height, height - i);
     int32_t *flt0_row = flt0 + i * flt_stride;
@@ -731,7 +542,7 @@
       const int w = AOMMIN(pu_width, width - j);
       const int ret = av1_selfguided_restoration(
           dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
-          flt_stride, sgr_params_idx, bit_depth, use_highbd);
+          flt_stride, sgr_params_idx, bit_depth);
       (void)ret;
       assert(!ret);
     }
@@ -741,22 +552,21 @@
 static AOM_INLINE void compute_sgrproj_err(
     const uint8_t *dat8, const int width, const int height,
     const int dat_stride, const uint8_t *src8, const int src_stride,
-    const int use_highbitdepth, const int bit_depth, const int pu_width,
-    const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
-    const int flt_stride, int *exqd, int64_t *err) {
+    const int bit_depth, const int pu_width, const int pu_height, const int ep,
+    int32_t *flt0, int32_t *flt1, const int flt_stride, int *exqd,
+    int64_t *err) {
   int exq[2];
-  apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
-            pu_width, pu_height, flt0, flt1, flt_stride);
+  apply_sgr(ep, dat8, width, height, dat_stride, bit_depth, pu_width, pu_height,
+            flt0, flt1, flt_stride);
   aom_clear_system_state();
   const sgr_params_type *const params = &av1_sgr_params[ep];
-  get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
-                    use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
-                    params);
+  get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, flt0,
+                    flt_stride, flt1, flt_stride, exq, params);
   aom_clear_system_state();
   encode_xq(exq, exqd, params);
-  *err = finer_search_pixel_proj_error(
-      src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
-      flt_stride, flt1, flt_stride, 2, exqd, params);
+  *err = finer_search_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt_stride, flt1,
+                                       flt_stride, 2, exqd, params);
 }
 
 static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err,
@@ -772,8 +582,8 @@
 
 static SgrprojInfo search_selfguided_restoration(
     const uint8_t *dat8, int width, int height, int dat_stride,
-    const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
-    int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) {
+    const uint8_t *src8, int src_stride, int bit_depth, int pu_width,
+    int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) {
   int32_t *flt0 = rstbuf;
   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
   int ep, idx, bestep = 0;
@@ -788,8 +598,8 @@
     for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
-                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          bit_depth, pu_width, pu_height, ep, flt0, flt1,
+                          flt_stride, exqd, &err);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
   } else {
@@ -798,8 +608,8 @@
       ep = sgproj_ep_grp1_seed[idx];
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
-                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          bit_depth, pu_width, pu_height, ep, flt0, flt1,
+                          flt_stride, exqd, &err);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
     // evaluate left and right ep of winner in seed ep
@@ -809,8 +619,8 @@
         continue;
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
-                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          bit_depth, pu_width, pu_height, ep, flt0, flt1,
+                          flt_stride, exqd, &err);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
     // evaluate last two group
@@ -818,8 +628,8 @@
       ep = sgproj_ep_grp2_3[idx][bestep];
       int64_t err;
       compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
-                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
-                          flt0, flt1, flt_stride, exqd, &err);
+                          bit_depth, pu_width, pu_height, ep, flt0, flt1,
+                          flt_stride, exqd, &err);
       get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
     }
   }
@@ -859,7 +669,6 @@
 
   const MACROBLOCK *const x = rsc->x;
   const AV1_COMMON *const cm = rsc->cm;
-  const int highbd = cm->seq_params.use_highbitdepth;
   const int bit_depth = cm->seq_params.bit_depth;
 
   const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
@@ -886,8 +695,8 @@
   rusi->sgrproj = search_selfguided_restoration(
       dgd_start, limits->h_end - limits->h_start,
       limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
-      rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
-      tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning);
+      rsc->src_stride, bit_depth, procunit_width, procunit_height, tmpbuf,
+      rsc->lpf_sf->enable_sgr_ep_pruning);
 
   RestorationUnitInfo rui;
   rui.restoration_type = RESTORE_SGRPROJ;
@@ -915,47 +724,6 @@
   if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
 }
 
-void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
-                         int h_start, int h_end, int v_start, int v_end,
-                         int dgd_stride, int src_stride, int64_t *M,
-                         int64_t *H) {
-  int i, j, k, l;
-  int16_t Y[WIENER_WIN2];
-  const int wiener_win2 = wiener_win * wiener_win;
-  const int wiener_halfwin = (wiener_win >> 1);
-  uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
-  memset(M, 0, sizeof(*M) * wiener_win2);
-  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
-  for (i = v_start; i < v_end; i++) {
-    for (j = h_start; j < h_end; j++) {
-      const int16_t X = (int16_t)src[i * src_stride + j] - (int16_t)avg;
-      int idx = 0;
-      for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
-        for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
-          Y[idx] = (int16_t)dgd[(i + l) * dgd_stride + (j + k)] - (int16_t)avg;
-          idx++;
-        }
-      }
-      assert(idx == wiener_win2);
-      for (k = 0; k < wiener_win2; ++k) {
-        M[k] += (int32_t)Y[k] * X;
-        for (l = k; l < wiener_win2; ++l) {
-          // H is a symmetric matrix, so we only need to fill out the upper
-          // triangle here. We can copy it down to the lower triangle outside
-          // the (i, j) loops.
-          H[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l];
-        }
-      }
-    }
-  }
-  for (k = 0; k < wiener_win2; ++k) {
-    for (l = k + 1; l < wiener_win2; ++l) {
-      H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
-    }
-  }
-}
-
 void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8,
                                 const uint8_t *src8, int h_start, int h_end,
                                 int v_start, int v_end, int dgd_stride,
@@ -1457,9 +1225,7 @@
     // Derive threshold as sqr(normalized Qscale) * scale / 16,
     const uint64_t thresh =
         (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
-    const int highbd = rsc->cm->seq_params.use_highbitdepth;
-    const uint64_t src_var =
-        var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
+    const uint64_t src_var = var_restoration_unit(limits, rsc->src, rsc->plane);
     // Do not perform Wiener search if source variance is lower than threshold
     // or if the reconstruction error is zero
     int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0);
@@ -1487,16 +1253,10 @@
   int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN];
 
   const AV1_COMMON *const cm = rsc->cm;
-  if (cm->seq_params.use_highbitdepth) {
-    av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
-                             rsc->src_buffer, limits->h_start, limits->h_end,
-                             limits->v_start, limits->v_end, rsc->dgd_stride,
-                             rsc->src_stride, M, H, cm->seq_params.bit_depth);
-  } else {
-    av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
-                      limits->h_start, limits->h_end, limits->v_start,
-                      limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
-  }
+  av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+                           limits->h_start, limits->h_end, limits->v_start,
+                           limits->v_end, rsc->dgd_stride, rsc->src_stride, M,
+                           H, cm->seq_params.bit_depth);
 
   if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) {
     rsc->bits += bits_none;
@@ -1580,9 +1340,8 @@
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-  const int highbd = rsc->cm->seq_params.use_highbitdepth;
   rusi->sse[RESTORE_NONE] = sse_restoration_unit(
-      limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
+      limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane);
 
   rsc->sse += rusi->sse[RESTORE_NONE];
 }
@@ -1719,11 +1478,9 @@
     double best_cost = 0;
     RestorationType best_rtype = RESTORE_NONE;
 
-    const int highbd = rsc.cm->seq_params.use_highbitdepth;
     if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) {
       av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
-                       rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
-                       highbd);
+                       rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER);
 
       for (RestorationType r = 0; r < num_rtypes; ++r) {
         if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
diff --git a/av1/encoder/rc_utils.h b/av1/encoder/rc_utils.h
index d5325e8..fbceca5 100644
--- a/av1/encoder/rc_utils.h
+++ b/av1/encoder/rc_utils.h
@@ -263,11 +263,7 @@
     const int64_t high_err_target = cpi->ambient_err;
     const int64_t low_err_target = cpi->ambient_err >> 1;
 
-    if (cm->seq_params.use_highbitdepth) {
-      kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    } else {
-      kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
-    }
+    kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
 
     // Prevent possible divide by zero error below for perfect KF
     kf_err += !kf_err;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 3902841..848c589 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -799,33 +799,6 @@
   return total_sse;
 }
 
-int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
-                          intptr_t block_size, int64_t *ssz) {
-  int i;
-  int64_t error = 0, sqcoeff = 0;
-
-  for (i = 0; i < block_size; i++) {
-    const int diff = coeff[i] - dqcoeff[i];
-    error += diff * diff;
-    sqcoeff += coeff[i] * coeff[i];
-  }
-
-  *ssz = sqcoeff;
-  return error;
-}
-
-int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff,
-                             intptr_t block_size) {
-  int64_t error = 0;
-
-  for (int i = 0; i < block_size; i++) {
-    const int diff = coeff[i] - dqcoeff[i];
-    error += diff * diff;
-  }
-
-  return error;
-}
-
 int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff, intptr_t block_size,
                                  int64_t *ssz, int bd) {
@@ -4832,28 +4805,18 @@
 }
 
 static AOM_INLINE void init_neighbor_pred_buf(
-    const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args,
-    int is_hbd) {
-  if (is_hbd) {
-    const int len = sizeof(uint16_t);
-    args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred);
-    args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred +
-                                                 (MAX_SB_SQUARE >> 1) * len);
-    args->above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len);
-    args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred);
-    args->left_pred_buf[1] =
-        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len);
-    args->left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len);
-  } else {
-    args->above_pred_buf[0] = obmc_buffer->above_pred;
-    args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1);
-    args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE;
-    args->left_pred_buf[0] = obmc_buffer->left_pred;
-    args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1);
-    args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE;
-  }
+    const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args) {
+  const int len = sizeof(uint16_t);
+  args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred);
+  args->above_pred_buf[1] =
+      CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1) * len);
+  args->above_pred_buf[2] =
+      CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len);
+  args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred);
+  args->left_pred_buf[1] =
+      CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len);
+  args->left_pred_buf[2] =
+      CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len);
 }
 
 #if CONFIG_NEW_REF_SIGNALING
@@ -4941,7 +4904,7 @@
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   unsigned char segment_id = mbmi->segment_id;
 
-  init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd));
+  init_neighbor_pred_buf(&x->obmc_buffer, args);
   av1_collect_neighbors_ref_counts(xd);
   estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single,
                            ref_costs_comp);
@@ -6351,7 +6314,7 @@
   int num_single_modes_processed = 0;
 
   // Temporary buffers used by handle_inter_mode().
-  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+  uint8_t *const tmp_buf = CONVERT_TO_BYTEPTR(x->tmp_pred_bufs[0]);
 
   // The best RD found for the reference frame, among single reference modes.
   // Note that the 0-th element will contain a cut-off that is later used
@@ -7246,34 +7209,19 @@
   int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE);
   int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE);
   const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
-  const int is_hbd = is_cur_buf_hbd(xd);
 
-  if (!is_hbd) {
-    for (int row = 0; row < ctxt->overlap; ++row) {
-      const uint8_t m0 = mask1d[row];
-      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-      for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
-        wsrc[col] = m1 * tmp[col];
-        mask[col] = m0;
-      }
-      wsrc += bw;
-      mask += bw;
-      tmp += ctxt->tmp_stride;
-    }
-  } else {
-    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+  const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
-    for (int row = 0; row < ctxt->overlap; ++row) {
-      const uint8_t m0 = mask1d[row];
-      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-      for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
-        wsrc[col] = m1 * tmp16[col];
-        mask[col] = m0;
-      }
-      wsrc += bw;
-      mask += bw;
-      tmp16 += ctxt->tmp_stride;
+  for (int row = 0; row < ctxt->overlap; ++row) {
+    const uint8_t m0 = mask1d[row];
+    const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+    for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
+      wsrc[col] = m1 * tmp16[col];
+      mask[col] = m0;
     }
+    wsrc += bw;
+    mask += bw;
+    tmp16 += ctxt->tmp_stride;
   }
 }
 
@@ -7294,36 +7242,19 @@
   int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw);
   int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw);
   const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
-  const int is_hbd = is_cur_buf_hbd(xd);
+  const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
-  if (!is_hbd) {
-    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
-      for (int col = 0; col < ctxt->overlap; ++col) {
-        const uint8_t m0 = mask1d[col];
-        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
-                    (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
-        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
-      }
-      wsrc += bw;
-      mask += bw;
-      tmp += ctxt->tmp_stride;
+  for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+    for (int col = 0; col < ctxt->overlap; ++col) {
+      const uint8_t m0 = mask1d[col];
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                  (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+      mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
     }
-  } else {
-    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
-
-    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
-      for (int col = 0; col < ctxt->overlap; ++col) {
-        const uint8_t m0 = mask1d[col];
-        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
-                    (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
-        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
-      }
-      wsrc += bw;
-      mask += bw;
-      tmp16 += ctxt->tmp_stride;
-    }
+    wsrc += bw;
+    mask += bw;
+    tmp16 += ctxt->tmp_stride;
   }
 }
 
@@ -7376,7 +7307,6 @@
   int32_t *mask_buf = obmc_buffer->mask;
   int32_t *wsrc_buf = obmc_buffer->wsrc;
 
-  const int is_hbd = is_cur_buf_hbd(xd);
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
 
   // plane 0 should not be sub-sampled
@@ -7413,26 +7343,14 @@
                                  calc_target_weighted_pred_left, &ctxt);
   }
 
-  if (!is_hbd) {
-    const uint8_t *src = x->plane[0].src.buf;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
 
-    for (int row = 0; row < bh; ++row) {
-      for (int col = 0; col < bw; ++col) {
-        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
-      }
-      wsrc_buf += bw;
-      src += x->plane[0].src.stride;
+  for (int row = 0; row < bh; ++row) {
+    for (int col = 0; col < bw; ++col) {
+      wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
     }
-  } else {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
-
-    for (int row = 0; row < bh; ++row) {
-      for (int col = 0; col < bw; ++col) {
-        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
-      }
-      wsrc_buf += bw;
-      src += x->plane[0].src.stride;
-    }
+    wsrc_buf += bw;
+    src += x->plane[0].src.stride;
   }
 }
 
@@ -7453,18 +7371,12 @@
    2 * (src)[(i) + (stride) * ((j) + 1)] -  /* NOLINT */ \
    (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
 
-sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
-                   bool high_bd) {
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j) {
   int16_t s_x;
   int16_t s_y;
-  if (high_bd) {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(input);
-    s_x = SOBEL_X(src, stride, i, j);
-    s_y = SOBEL_Y(src, stride, i, j);
-  } else {
-    s_x = SOBEL_X(input, stride, i, j);
-    s_y = SOBEL_Y(input, stride, i, j);
-  }
+  const uint16_t *src = CONVERT_TO_SHORTPTR(input);
+  s_x = SOBEL_X(src, stride, i, j);
+  s_y = SOBEL_Y(src, stride, i, j);
   sobel_xy r = { .x = s_x, .y = s_y };
   return r;
 }
@@ -7475,7 +7387,7 @@
                                                                30, 12, 2,  0 };
 
 void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
-                       uint8_t *dst, bool high_bd, int bd) {
+                       uint8_t *dst, int bd) {
   ConvolveParams conv_params = get_conv_params(0, 0, bd);
   InterpFilterParams filter = { .filter_ptr = gauss_filter,
                                 .taps = 8,
@@ -7485,18 +7397,12 @@
   assert(w % 8 == 0);
   // Because we use an eight tap filter, the stride should be at least 7 + w.
   assert(src_stride >= w + 7);
-  if (high_bd) {
-    av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src), src_stride,
-                              CONVERT_TO_SHORTPTR(dst), w, w, h, &filter,
-                              &filter, 0, 0, &conv_params, bd);
-  } else {
-    av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0,
-                       &conv_params);
-  }
+  av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src), src_stride,
+                            CONVERT_TO_SHORTPTR(dst), w, w, h, &filter, &filter,
+                            0, 0, &conv_params, bd);
 }
 
-static EdgeInfo edge_probability(const uint8_t *input, int w, int h,
-                                 bool high_bd, int bd) {
+static EdgeInfo edge_probability(const uint8_t *input, int w, int h, int bd) {
   // The probability of an edge in the whole image is the same as the highest
   // probability of an edge for any individual pixel. Use Sobel as the metric
   // for finding an edge.
@@ -7506,7 +7412,7 @@
   // Ignore the 1 pixel border around the image for the computation.
   for (int j = 1; j < h - 1; ++j) {
     for (int i = 1; i < w - 1; ++i) {
-      sobel_xy g = av1_sobel(input, w, i, j, high_bd);
+      sobel_xy g = av1_sobel(input, w, i, j);
       // Scale down to 8-bit to get same output regardless of bit depth.
       int16_t g_x = g.x >> (bd - 8);
       int16_t g_y = g.y >> (bd - 8);
@@ -7524,27 +7430,19 @@
  * edges in the image.
  */
 EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
-                         bool high_bd, int bd) {
+                         int bd) {
   if (w < 3 || h < 3) {
     EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 };
     return n;
   }
   uint8_t *blurred;
-  if (high_bd) {
-    blurred = CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * w * h));
-  } else {
-    blurred = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * w * h);
-  }
-  av1_gaussian_blur(src, src_stride, w, h, blurred, high_bd, bd);
+  blurred = CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * w * h));
+  av1_gaussian_blur(src, src_stride, w, h, blurred, bd);
   // Skip the non-maximum suppression step in Canny edge detection. We just
   // want a probability of an edge existing in the buffer, which is determined
   // by the strongest edge in it -- we don't need to eliminate the weaker
   // edges. Use Sobel for the edge detection.
-  EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd);
-  if (high_bd) {
-    aom_free(CONVERT_TO_SHORTPTR(blurred));
-  } else {
-    aom_free(blurred);
-  }
+  EdgeInfo prob = edge_probability(blurred, w, h, bd);
+  aom_free(CONVERT_TO_SHORTPTR(blurred));
   return prob;
 }
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 9c7ca19..0c5b35e 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -121,13 +121,13 @@
  * as a 16-bit array. bd is the bit depth.
  */
 EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
-                         bool high_bd, int bd);
+                         int bd);
 
 /** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
  * tests.
  */
 void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
-                       uint8_t *dst, bool high_bd, int bd);
+                       uint8_t *dst, int bd);
 
 /*!\cond */
 /* Applies standard 3x3 Sobel matrix. */
@@ -137,8 +137,7 @@
 } sobel_xy;
 /*!\endcond */
 
-sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
-                   bool high_bd);
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j);
 
 void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index cf77864..6142355 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -840,10 +840,6 @@
       AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
 }
 
-unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
-                                           const struct buf_2d *ref,
-                                           BLOCK_SIZE bs);
-
 unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd);
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index cb894df..d096df3 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -140,9 +140,8 @@
   const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
 
   av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
-                        pd->subsampling_x, pd->subsampling_y, xd->bd,
-                        is_cur_buf_hbd(xd), false, sf, pd->pre,
-                        xd->mi[0]->interp_fltr);
+                        pd->subsampling_x, pd->subsampling_y, xd->bd, false, sf,
+                        pd->pre, xd->mi[0]->interp_fltr);
 
   inter_pred_params.conv_params = get_conv_params_no_round(
       0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
@@ -245,11 +244,10 @@
     const struct buf_2d *const pre_buf = &pd->pre[0];
     const MV mv = above_mbmi->mv[0].as_mv;
 
-    av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
-                          mi_x >> pd->subsampling_x, pd->subsampling_x,
-                          pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
-                          xd->block_ref_scale_factors[0], pre_buf,
-                          above_mbmi->interp_fltr);
+    av1_init_inter_params(
+        &inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+        mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd,
+        0, xd->block_ref_scale_factors[0], pre_buf, above_mbmi->interp_fltr);
     inter_pred_params.conv_params = get_conv_params(0, j, xd->bd);
 
     av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv,
@@ -334,35 +332,20 @@
 
   InterPredParams inter_pred_params;
 
-  av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
-                        mi_x >> pd->subsampling_x, pd->subsampling_x,
-                        pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
-                        xd->block_ref_scale_factors[ref], &pd->pre[ref],
-                        mi->interp_fltr);
+  av1_init_inter_params(
+      &inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+      mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd,
+      0, xd->block_ref_scale_factors[ref], &pd->pre[ref], mi->interp_fltr);
   inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
   av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
 
-  uint8_t *const dst = get_buf_by_bd(xd, ext_dst);
+  uint8_t *const dst = CONVERT_TO_BYTEPTR(ext_dst);
   const MV mv = mi->mv[ref].as_mv;
 
   av1_enc_build_one_inter_predictor(dst, ext_dst_stride, &mv,
                                     &inter_pred_params);
 }
 
-static void build_masked_compound(
-    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
-    const uint8_t *src1, int src1_stride,
-    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
-    int w) {
-  // Derive subsampling from h and w passed in. May be refactored to
-  // pass in subsampling factors directly.
-  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
-  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
-  const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
-  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                     mask, block_size_wide[sb_type], w, h, subw, subh);
-}
-
 static void build_masked_compound_highbd(
     uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
     const uint8_t *src1_8, int src1_stride,
@@ -390,39 +373,22 @@
   uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
   mbmi->interinter_comp.seg_mask = xd->seg_mask;
   const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
-  const int is_hbd = is_cur_buf_hbd(xd);
 
   if (is_compound && is_masked_compound_type(comp_data->type)) {
     if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
-      if (is_hbd) {
-        av1_build_compound_diffwtd_mask_highbd(
-            comp_data->seg_mask, comp_data->mask_type,
-            CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-            CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
-      } else {
-        av1_build_compound_diffwtd_mask(
-            comp_data->seg_mask, comp_data->mask_type, ext_dst0,
-            ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
-      }
+      av1_build_compound_diffwtd_mask_highbd(
+          comp_data->seg_mask, comp_data->mask_type,
+          CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
     }
 
-    if (is_hbd) {
-      build_masked_compound_highbd(
-          dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
-          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
-          mbmi->sb_type[PLANE_TYPE_Y], h, w, xd->bd);
-    } else {
-      build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
-                            ext_dst1, ext_dst_stride1, comp_data,
-                            mbmi->sb_type[PLANE_TYPE_Y], h, w);
-    }
+    build_masked_compound_highbd(
+        dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+        CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
+        mbmi->sb_type[PLANE_TYPE_Y], h, w, xd->bd);
   } else {
-    if (is_hbd) {
-      aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
-                               CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
-    } else {
-      aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
-    }
+    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
+                             CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
   }
 }
 
diff --git a/av1/encoder/superres_scale.c b/av1/encoder/superres_scale.c
index d7f92f0..9db0b54 100644
--- a/av1/encoder/superres_scale.c
+++ b/av1/encoder/superres_scale.c
@@ -26,43 +26,19 @@
   DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
   int n = 0;
   memset(freq_energy, 0, sizeof(freq_energy));
-  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
-    for (int i = 0; i < height - 4; i += 4) {
-      for (int j = 0; j < width - 16; j += 16) {
-        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
-                            H_DCT, bd);
-        for (int k = 1; k < 16; ++k) {
-          const uint64_t this_energy =
-              ((int64_t)coeff[k] * coeff[k]) +
-              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
-              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
-              ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
-        }
-        n++;
+  const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+  for (int i = 0; i < height - 4; i += 4) {
+    for (int j = 0; j < width - 16; j += 16) {
+      av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+                          H_DCT, bd);
+      for (int k = 1; k < 16; ++k) {
+        const uint64_t this_energy = ((int64_t)coeff[k] * coeff[k]) +
+                                     ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+                                     ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+                                     ((int64_t)coeff[k + 48] * coeff[k + 48]);
+        freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
       }
-    }
-  } else {
-    assert(bd == 8);
-    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
-    for (int i = 0; i < height - 4; i += 4) {
-      for (int j = 0; j < width - 16; j += 16) {
-        for (int ii = 0; ii < 4; ++ii)
-          for (int jj = 0; jj < 16; ++jj)
-            src16[ii * 16 + jj] =
-                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
-        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
-        for (int k = 1; k < 16; ++k) {
-          const uint64_t this_energy =
-              ((int64_t)coeff[k] * coeff[k]) +
-              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
-              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
-              ((int64_t)coeff[k + 48] * coeff[k + 48]);
-          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
-        }
-        n++;
-      }
+      n++;
     }
   }
   if (n) {
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 4b62b2e..fa662a2 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -273,11 +273,6 @@
   }
 }
 
-// Helper function to determine whether a frame is encoded with high bit-depth.
-static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
-  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
-}
-
 /*!\endcond */
 /*!\brief Builds predictor for blocks in temporal filtering. This is the
  * second step for temporal filtering, which is to construct predictions from
@@ -321,7 +316,6 @@
   const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
   const int bit_depth = mbd->bd;                      // Bit depth.
   const int is_intrabc = 0;                           // Is intra-copied?
-  const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
 
   // Default interpolation filters.
   const InterpFilter interp_filters = MULTITAP_SHARP2;
@@ -360,8 +354,8 @@
         // Build predictor for each sub-block on the current plane.
         InterPredParams inter_pred_params;
         av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
-                              subsampling_y, bit_depth, is_high_bitdepth,
-                              is_intrabc, scale, &ref_buf, interp_filters);
+                              subsampling_y, bit_depth, is_intrabc, scale,
+                              &ref_buf, interp_filters);
         inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
         av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
                                           plane_w, &mv, &inter_pred_params);
@@ -393,7 +387,6 @@
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_cur_buf_hbd(mbd);
   const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
 
   int plane_offset = 0;
@@ -407,7 +400,7 @@
     for (int i = 0; i < h; ++i) {
       for (int j = 0; j < w; ++j) {
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        const int pred_value = pred16[idx];
         accum[idx] += TF_WEIGHT_SCALE * pred_value;
         count[idx] += TF_WEIGHT_SCALE;
         ++pred_idx;
@@ -427,7 +420,6 @@
 //   tgt_stride: Stride for target buffer.
 //   height: Height of block for computation.
 //   width: Width of block for computation.
-//   is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
 //   square_diff: Pointer to save the squared differences.
 // Returns:
 //   Nothing will be returned. But the content to which `square_diff` points
@@ -436,9 +428,7 @@
                                        const int ref_stride, const uint8_t *tgt,
                                        const int tgt_offset,
                                        const int tgt_stride, const int height,
-                                       const int width,
-                                       const int is_high_bitdepth,
-                                       uint32_t *square_diff) {
+                                       const int width, uint32_t *square_diff) {
   const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
   const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
 
@@ -447,10 +437,8 @@
   int idx = 0;
   for (int i = 0; i < height; ++i) {
     for (int j = 0; j < width; ++j) {
-      const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
-                                                  : ref[ref_offset + ref_idx];
-      const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
-                                                  : tgt[tgt_offset + tgt_idx];
+      const uint16_t ref_value = ref16[ref_offset + ref_idx];
+      const uint16_t tgt_value = tgt16[tgt_offset + tgt_idx];
       const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
                                                     : (tgt_value - ref_value);
       square_diff[idx] = diff * diff;
@@ -497,7 +485,7 @@
  * Nothing is returned, but the contents of `accum`, `pred` and `count`
  * will be modified.
  */
-void av1_apply_temporal_filter_c(
+void av1_highbd_apply_temporal_filter_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const MV *subblock_mvs,
@@ -507,7 +495,6 @@
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
   const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
   // Frame information.
   const int frame_height = frame_to_filter->y_crop_height;
@@ -529,8 +516,7 @@
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
     const uint8_t *ref = frame_to_filter->buffers[plane];
     compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
-                        plane_w, plane_h, plane_w, is_high_bitdepth,
-                        square_diff + plane_offset);
+                        plane_w, plane_h, plane_w, square_diff + plane_offset);
     plane_offset += mb_pels;
   }
 
@@ -613,7 +599,7 @@
         const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
 
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        const int pred_value = pred16[idx];
         accum[idx] += weight * pred_value;
         count[idx] += weight;
 
@@ -626,19 +612,6 @@
   aom_free(square_diff);
 }
 
-// Calls High bit-depth temporal filter
-void av1_highbd_apply_temporal_filter_c(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
-    const int *subblock_mses, const int q_factor, const int filter_strength,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
-                              num_planes, noise_levels, subblock_mvs,
-                              subblock_mses, q_factor, filter_strength, pred,
-                              accum, count);
-}
-
 /*!\brief Normalizes the accumulated filtering result to produce the filtered
  *        frame
  *
@@ -665,7 +638,6 @@
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
   const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
 
   int plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -682,12 +654,7 @@
       for (int j = 0; j < plane_w; ++j) {
         const int idx = plane_idx + plane_offset;
         const uint16_t rounding = count[idx] >> 1;
-        if (is_high_bitdepth) {
-          buf16[frame_idx] =
-              (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
-        } else {
-          buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
-        }
+        buf16[frame_idx] = (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
         ++plane_idx;
         ++frame_idx;
       }
@@ -760,7 +727,6 @@
   const int mi_h = mi_size_high_log2[block_size];
   const int mi_w = mi_size_wide_log2[block_size];
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
 
   // Quantization factor used in temporal filtering.
   const int q_factor = get_q(cpi);
@@ -796,13 +762,11 @@
   mbd->mi = &tmp_mb_mode_info;
   mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
   // Allocate memory for predictor, accumulator and count.
-  uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
   uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
   uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
   uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
-  memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
   memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
-  uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
+  uint8_t *const pred = CONVERT_TO_BYTEPTR(pred16);
 
   // Do filtering.
   FRAME_DIFF diff = { 0, 0 };
@@ -847,32 +811,17 @@
-          // only supports 32x32 block size, 5x5 filtering window, 8-bit
-          // encoding, and the case when the video is not with `YUV 4:2:2`
-          // format.
+          // only supports 32x32 block size, 5x5 filtering window, and input
+          // that is not in `YUV 4:2:2` format.
-          if (is_frame_high_bitdepth(frame_to_filter)) {  // for high bit-depth
-            if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 &&
-                !is_yuv422_format) {
-              av1_highbd_apply_temporal_filter(
-                  frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-                  noise_levels, subblock_mvs, subblock_mses, q_factor,
-                  filter_strength, pred, accum, count);
-            } else {
-              av1_apply_temporal_filter_c(
-                  frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-                  noise_levels, subblock_mvs, subblock_mses, q_factor,
-                  filter_strength, pred, accum, count);
-            }
-          } else {  // for 8-bit
-            if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 &&
-                !is_yuv422_format) {
-              av1_apply_temporal_filter(
-                  frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-                  noise_levels, subblock_mvs, subblock_mses, q_factor,
-                  filter_strength, pred, accum, count);
-            } else {
-              av1_apply_temporal_filter_c(
-                  frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-                  noise_levels, subblock_mvs, subblock_mses, q_factor,
-                  filter_strength, pred, accum, count);
-            }
+          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 &&
+              !is_yuv422_format) {
+            av1_highbd_apply_temporal_filter(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
+          } else {
+            av1_highbd_apply_temporal_filter_c(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mvs, subblock_mses, q_factor,
+                filter_strength, pred, accum, count);
           }
         }
       }
@@ -906,7 +855,6 @@
   mbd->mi = input_mb_mode_info;
 
   free(tmp_mb_mode_info);
-  aom_free(pred8);
   aom_free(pred16);
   aom_free(accum);
   aom_free(count);
@@ -1055,7 +1003,6 @@
   const int stride = frame->strides[is_y_plane ? 0 : 1];
   const uint8_t *src = frame->buffers[plane];
   const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame);
 
   int64_t accum = 0;
   int count = 0;
@@ -1067,7 +1014,7 @@
       for (int ii = -1; ii <= 1; ++ii) {
         for (int jj = -1; jj <= 1; ++jj) {
           const int idx = center_idx + ii * stride + jj;
-          mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
+          mat[ii + 1][jj + 1] = src16[idx];
         }
       }
       // Compute sobel gradients.
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index b19360d..8b74a2d 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -49,16 +49,10 @@
   QUANT_PARAM quant_param;
   av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
 
-  if (is_cur_buf_hbd(xd)) {
-    av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
-                                  scan_order, &quant_param);
-    *recon_error =
-        av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
-  } else {
-    av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
-                           &quant_param);
-    *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
-  }
+  av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
+                                scan_order, &quant_param);
+  *recon_error =
+      av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
 
   *recon_error = AOMMAX(*recon_error, 1);
 
@@ -68,7 +62,7 @@
 
 static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw,
                                     tran_low_t *coeff, TX_SIZE tx_size,
-                                    int bit_depth, int is_hbd) {
+                                    int bit_depth) {
   TxfmParam txfm_param;
   txfm_param.tx_type = DCT_DCT;
 #if CONFIG_IST
@@ -79,7 +73,6 @@
   txfm_param.tx_set_type = EXT_TX_SET_ALL16;
 
   txfm_param.bd = bit_depth;
-  txfm_param.is_hbd = is_hbd;
   av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
 }
 
@@ -94,7 +87,7 @@
 
   av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
                      dst_stride);
-  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
+  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd);
   return aom_satd(coeff, pix_num);
 }
 
@@ -122,8 +115,7 @@
   uint16_t eob;
   av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
                      dst_stride);
-  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
-               is_cur_buf_hbd(xd));
+  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd);
 
   get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
                      sse);
@@ -260,8 +252,7 @@
   tran_low_t *dqcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
   tran_low_t *best_coeff =
       aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
-  uint8_t *predictor =
-      is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+  uint8_t *predictor = CONVERT_TO_BYTEPTR(predictor8);
   int64_t recon_error = 1, sse = 1;
 
   memset(tpl_stats, 0, sizeof(*tpl_stats));
@@ -283,16 +274,10 @@
   // Pre-load the bottom left line.
   if (xd->left_available &&
       mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
-    if (is_cur_buf_hbd(xd)) {
-      uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
-      for (int i = 0; i < bw; ++i)
-        dst[(bw + i) * dst_buffer_stride - 1] =
-            dst[(bw - 1) * dst_buffer_stride - 1];
-    } else {
-      for (int i = 0; i < bw; ++i)
-        dst_buffer[(bw + i) * dst_buffer_stride - 1] =
-            dst_buffer[(bw - 1) * dst_buffer_stride - 1];
-    }
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+    for (int i = 0; i < bw; ++i)
+      dst[(bw + i) * dst_buffer_stride - 1] =
+          dst[(bw - 1) * dst_buffer_stride - 1];
   }
 
   // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
@@ -423,8 +408,8 @@
                               ref_frame_ptr->y_stride };
     InterPredParams inter_pred_params;
     av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
-                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          &tpl_data->sf, &ref_buf, kernel);
+                          mi_col * MI_SIZE, 0, 0, xd->bd, 0, &tpl_data->sf,
+                          &ref_buf, kernel);
     inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
 
     av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
@@ -478,8 +463,8 @@
                               ref_frame_ptr->y_width, ref_frame_ptr->y_height,
                               ref_frame_ptr->y_stride };
     av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
-                          mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
-                          &tpl_data->sf, &ref_buf, kernel);
+                          mi_col * MI_SIZE, 0, 0, xd->bd, 0, &tpl_data->sf,
+                          &ref_buf, kernel);
     inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
 
     av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
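
Worth spelling out why `predictor = CONVERT_TO_BYTEPTR(predictor8)` can become unconditional: 16-bit planes are threaded through `uint8_t *` interfaces by shifting the pointer value, not by changing the type, so the existing prototypes serve the 16-bit storage directly. A minimal sketch of that convention (the real macros live in aom_dsp_common.h; the shift-based definitions here are an approximation of them, under renamed identifiers):

```c
#include <assert.h>
#include <stdint.h>

/* Approximation of the libaom convention: a uint16_t buffer is passed through
 * uint8_t * signatures by halving its address, and recovered by doubling it
 * again at the point where pixels are actually read or written. */
#define TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
#define TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

int main(void) {
  static uint16_t pred16[64];          /* 16-bit storage, as in the TPL code */
  uint8_t *pred = TO_BYTEPTR(pred16);  /* what gets handed to uint8_t * APIs */
  assert(TO_SHORTPTR(pred) == pred16); /* consumers convert back before use */
  return 0;
}
```
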
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index a95b501..2a8c549 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -118,13 +118,8 @@
       buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
       buf.stride = y_stride;
 
-      if (cpi->common.seq_params.use_highbitdepth) {
-        assert(frame->flags & YV12_FLAG_HIGHBITDEPTH);
-        var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
-                                                  bit_depth);
-      } else {
-        var += av1_get_sby_perpixel_variance(cpi, &buf, block_size);
-      }
+      var +=
+          av1_high_get_sby_perpixel_variance(cpi, &buf, block_size, bit_depth);
       var_count += 1.0;
     }
   }
@@ -214,41 +209,15 @@
   }
 }
 
-static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
-                                    const uint8_t *blurred, int blurred_stride,
-                                    uint8_t *dst, int dst_stride, int w, int h,
-                                    double amount) {
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      const double val =
-          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
-      dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
-    }
-    source += source_stride;
-    blurred += blurred_stride;
-    dst += dst_stride;
-  }
-}
-
 static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
                                const YV12_BUFFER_CONFIG *source,
                                const YV12_BUFFER_CONFIG *blurred,
                                const YV12_BUFFER_CONFIG *dst, double amount) {
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  if (cpi->common.seq_params.use_highbitdepth) {
-    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
-    assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
-    assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
-    highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
-                        CONVERT_TO_SHORTPTR(blurred->y_buffer),
-                        blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
-                        dst->y_stride, source->y_width, source->y_height,
-                        amount, bit_depth);
-  } else {
-    unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
-                 blurred->y_stride, dst->y_buffer, dst->y_stride,
-                 source->y_width, source->y_height, amount);
-  }
+  highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                      CONVERT_TO_SHORTPTR(blurred->y_buffer), blurred->y_stride,
+                      CONVERT_TO_SHORTPTR(dst->y_buffer), dst->y_stride,
+                      source->y_width, source->y_height, amount, bit_depth);
 }
 
 // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128,
@@ -280,16 +249,10 @@
       uint8_t *dst_buf =
           dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
 
-      if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
-        av1_highbd_convolve_2d_sr(
-            CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
-            CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
-            &filter, &filter, 0, 0, &conv_params, bit_depth);
-      } else {
-        av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
-                           block_w, block_h, &filter, &filter, 0, 0,
-                           &conv_params);
-      }
+      av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
+                                CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride,
+                                block_w, block_h, &filter, &filter, 0, 0,
+                                &conv_params, bit_depth);
     }
   }
 }
@@ -368,9 +331,9 @@
 #endif
   YV12_BUFFER_CONFIG sharpened;
   memset(&sharpened, 0, sizeof(sharpened));
-  aom_alloc_frame_buffer(
-      &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&sharpened, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   const double baseline_variance = frame_average_variance(cpi, source);
   double unsharp_amount;
@@ -444,9 +407,9 @@
 
   YV12_BUFFER_CONFIG blurred;
   memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(
-      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   gaussian_blur(bit_depth, source, &blurred);
   unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
@@ -466,12 +429,12 @@
   YV12_BUFFER_CONFIG source_extended, blurred;
   memset(&source_extended, 0, sizeof(source_extended));
   memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(
-      &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&source_extended, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
   gaussian_blur(bit_depth, &source_extended, &blurred);
@@ -505,12 +468,12 @@
   YV12_BUFFER_CONFIG source_extended, blurred;
   memset(&blurred, 0, sizeof(blurred));
   memset(&source_extended, 0, sizeof(source_extended));
-  aom_alloc_frame_buffer(
-      &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&source_extended, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
   gaussian_blur(bit_depth, &source_extended, &blurred);
@@ -541,12 +504,12 @@
   YV12_BUFFER_CONFIG source_block, blurred_block;
   memset(&source_block, 0, sizeof(source_block));
   memset(&blurred_block, 0, sizeof(blurred_block));
-  aom_alloc_frame_buffer(
-      &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&source_block, block_w, block_h, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred_block, block_w, block_h, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
@@ -556,58 +519,29 @@
       const int block_height = AOMMIN(height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (cm->seq_params.use_highbitdepth) {
-        assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
-        assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
-        uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
-                                  row_offset_y * source->y_stride +
-                                  col_offset_y;
-        uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
-                                      row_offset_y * blurred.y_stride +
-                                      col_offset_y;
-        uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer);
-        uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer);
+      uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+                                row_offset_y * source->y_stride + col_offset_y;
+      uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+                                    row_offset_y * blurred.y_stride +
+                                    col_offset_y;
+      uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer);
+      uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer);
 
-        // Copy block from source frame.
-        for (int i = 0; i < block_h; ++i) {
-          for (int j = 0; j < block_w; ++j) {
-            if (i >= block_height || j >= block_width) {
-              src_dst[j] = 0;
-              blurred_dst[j] = 0;
-            } else {
-              src_dst[j] = frame_src_buf[j];
-              blurred_dst[j] = frame_blurred_buf[j];
-            }
+      // Copy block from source frame.
+      for (int i = 0; i < block_h; ++i) {
+        for (int j = 0; j < block_w; ++j) {
+          if (i >= block_height || j >= block_width) {
+            src_dst[j] = 0;
+            blurred_dst[j] = 0;
+          } else {
+            src_dst[j] = frame_src_buf[j];
+            blurred_dst[j] = frame_blurred_buf[j];
           }
-          frame_src_buf += source->y_stride;
-          frame_blurred_buf += blurred.y_stride;
-          src_dst += source_block.y_stride;
-          blurred_dst += blurred_block.y_stride;
         }
-      } else {
-        uint8_t *frame_src_buf =
-            source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
-        uint8_t *frame_blurred_buf =
-            blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
-        uint8_t *blurred_dst = blurred_block.y_buffer;
-        uint8_t *src_dst = source_block.y_buffer;
-
-        // Copy block from source frame.
-        for (int i = 0; i < block_h; ++i) {
-          for (int j = 0; j < block_w; ++j) {
-            if (i >= block_height || j >= block_width) {
-              src_dst[j] = 0;
-              blurred_dst[j] = 0;
-            } else {
-              src_dst[j] = frame_src_buf[j];
-              blurred_dst[j] = frame_blurred_buf[j];
-            }
-          }
-          frame_src_buf += source->y_stride;
-          frame_blurred_buf += blurred.y_stride;
-          src_dst += source_block.y_stride;
-          blurred_dst += blurred_block.y_stride;
-        }
+        frame_src_buf += source->y_stride;
+        frame_blurred_buf += blurred.y_stride;
+        src_dst += source_block.y_stride;
+        blurred_dst += blurred_block.y_stride;
       }
 
       best_unsharp_amounts[index] = find_best_frame_unsharp_amount(
@@ -625,26 +559,14 @@
       const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (cm->seq_params.use_highbitdepth) {
-        assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
-        assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
-        uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
-                            row_offset_y * source->y_stride + col_offset_y;
-        uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
-                                row_offset_y * blurred.y_stride + col_offset_y;
-        highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf,
-                            blurred.y_stride, src_buf, source->y_stride,
-                            block_width, block_height,
-                            best_unsharp_amounts[index], bit_depth);
-      } else {
-        uint8_t *src_buf =
-            source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
-        uint8_t *blurred_buf =
-            blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
-        unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
-                     src_buf, source->y_stride, block_width, block_height,
-                     best_unsharp_amounts[index]);
-      }
+      uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+                          row_offset_y * source->y_stride + col_offset_y;
+      uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+                              row_offset_y * blurred.y_stride + col_offset_y;
+      highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf,
+                          blurred.y_stride, src_buf, source->y_stride,
+                          block_width, block_height,
+                          best_unsharp_amounts[index], bit_depth);
     }
   }
 
@@ -686,18 +608,10 @@
     float *ref, *main;
     ref = ref_data + i * stride;
     main = main_data + i * stride;
-    if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *src;
-      src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride;
-      for (int j = 0; j < width; ++j) {
-        ref[j] = main[j] = scale_factor * (float)src[j];
-      }
-    } else {
-      uint8_t *src;
-      src = source->y_buffer + i * source->y_stride;
-      for (int j = 0; j < width; ++j) {
-        ref[j] = main[j] = (float)src[j];
-      }
+    uint16_t *src;
+    src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride;
+    for (int j = 0; j < width; ++j) {
+      ref[j] = main[j] = scale_factor * (float)src[j];
     }
   }
   if (row < num_rows && col < num_cols) {
@@ -708,26 +622,14 @@
     const int block_height = AOMMIN(height - row_offset, block_h);
 
     float *main_buf = main_data + col_offset + row_offset * stride;
-    if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
-      uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) +
-                              row_offset * blurred->y_stride + col_offset;
-      for (int i = 0; i < block_height; ++i) {
-        for (int j = 0; j < block_width; ++j) {
-          main_buf[j] = scale_factor * (float)blurred_buf[j];
-        }
-        main_buf += stride;
-        blurred_buf += blurred->y_stride;
+    uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) +
+                            row_offset * blurred->y_stride + col_offset;
+    for (int i = 0; i < block_height; ++i) {
+      for (int j = 0; j < block_width; ++j) {
+        main_buf[j] = scale_factor * (float)blurred_buf[j];
       }
-    } else {
-      uint8_t *blurred_buf =
-          blurred->y_buffer + row_offset * blurred->y_stride + col_offset;
-      for (int i = 0; i < block_height; ++i) {
-        for (int j = 0; j < block_width; ++j) {
-          main_buf[j] = (float)blurred_buf[j];
-        }
-        main_buf += stride;
-        blurred_buf += blurred->y_stride;
-      }
+      main_buf += stride;
+      blurred_buf += blurred->y_stride;
     }
 
     frames->col++;
@@ -755,8 +657,7 @@
   memset(&resized_source, 0, sizeof(resized_source));
   aom_alloc_frame_buffer(
       &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1,
-      cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
-      cm->features.byte_alignment);
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
   av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
                                            bit_depth, av1_num_planes(cm));
 
@@ -772,7 +673,6 @@
   YV12_BUFFER_CONFIG blurred;
   memset(&blurred, 0, sizeof(blurred));
   aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1,
-                         cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   gaussian_blur(bit_depth, &resized_source, &blurred);
@@ -781,7 +681,6 @@
   YV12_BUFFER_CONFIG recon;
   memset(&recon, 0, sizeof(recon));
   aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, 1, 1,
-                         cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_yv12_copy_frame(&resized_source, &recon, 1);
@@ -829,16 +728,10 @@
       uint8_t *const recon_buf =
           recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
       // Set recon buf
-      if (cpi->common.seq_params.use_highbitdepth) {
-        highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
-                            CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
-                            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
-                            resized_block_w, resized_block_h, 0.0, bit_depth);
-      } else {
-        unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf,
-                     blurred.y_stride, recon_buf, recon.y_stride,
-                     resized_block_w, resized_block_h, 0.0);
-      }
+      highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                          CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                          CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
+                          resized_block_w, resized_block_h, 0.0, bit_depth);
 
       double vmaf;
       aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model,
@@ -846,17 +739,11 @@
                                 &vmaf);
 
       // Restore recon buf
-      if (cpi->common.seq_params.use_highbitdepth) {
-        highbd_unsharp_rect(
-            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
-            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
-            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
-            resized_block_h, 0.0, bit_depth);
-      } else {
-        unsharp_rect(orig_buf, resized_source.y_stride, orig_buf,
-                     resized_source.y_stride, recon_buf, recon.y_stride,
-                     resized_block_w, resized_block_h, 0.0);
-      }
+      highbd_unsharp_rect(
+          CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+          CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+          CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
+          resized_block_h, 0.0, bit_depth);
 #else
       const double vmaf = scores[index];
 #endif
@@ -973,47 +860,33 @@
   memset(&blurred_last, 0, sizeof(blurred_last));
   memset(&blurred_next, 0, sizeof(blurred_next));
 
-  aom_alloc_frame_buffer(
-      &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred_last, y_width, y_height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&blurred_next, y_width, y_height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   gaussian_blur(bit_depth, cur, &blurred_cur);
   gaussian_blur(bit_depth, last, &blurred_last);
   if (next) gaussian_blur(bit_depth, next, &blurred_next);
 
   double motion1, motion2 = 65536.0;
-  if (cm->seq_params.use_highbitdepth) {
-    assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
-    assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
-    const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
-    motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+  const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
+  motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+                               blurred_cur.y_stride,
+                               CONVERT_TO_SHORTPTR(blurred_last.y_buffer),
+                               blurred_last.y_stride, y_width, y_height) *
+            scale_factor;
+  if (next) {
+    motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
                                  blurred_cur.y_stride,
-                                 CONVERT_TO_SHORTPTR(blurred_last.y_buffer),
-                                 blurred_last.y_stride, y_width, y_height) *
+                                 CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
+                                 blurred_next.y_stride, y_width, y_height) *
               scale_factor;
-    if (next) {
-      assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH);
-      motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
-                                   blurred_cur.y_stride,
-                                   CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
-                                   blurred_next.y_stride, y_width, y_height) *
-                scale_factor;
-    }
-  } else {
-    motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
-                          blurred_last.y_buffer, blurred_last.y_stride, y_width,
-                          y_height);
-    if (next) {
-      motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
-                            blurred_next.y_buffer, blurred_next.y_stride,
-                            y_width, y_height);
-    }
   }
 
   aom_free_frame_buffer(&blurred_cur);
@@ -1128,18 +1001,18 @@
   memset(&src_sharpened, 0, sizeof(src_sharpened));
   memset(&recon_blurred, 0, sizeof(recon_blurred));
   memset(&src_blurred, 0, sizeof(src_blurred));
-  aom_alloc_frame_buffer(
-      &recon_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &src_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &recon_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
-  aom_alloc_frame_buffer(
-      &src_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth,
-      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&recon_sharpened, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&src_sharpened, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&recon_blurred, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(&src_blurred, width, height, 1, 1,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment);
 
   gaussian_blur(bit_depth, recon, &recon_blurred);
   gaussian_blur(bit_depth, src, &src_blurred);
@@ -1208,15 +1081,8 @@
   aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, recon, bit_depth,
                 &cpi->vmaf_info.last_frame_vmaf[layer_depth]);
 #endif  // CONFIG_USE_VMAF_RC
-  if (cpi->common.seq_params.use_highbitdepth) {
-    assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
-    assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
-    cpi->vmaf_info.last_frame_ysse[layer_depth] =
-        (double)aom_highbd_get_y_sse(source, recon);
-  } else {
-    cpi->vmaf_info.last_frame_ysse[layer_depth] =
-        (double)aom_get_y_sse(source, recon);
-  }
+  cpi->vmaf_info.last_frame_ysse[layer_depth] =
+      (double)aom_highbd_get_y_sse(source, recon);
 
 #if CONFIG_USE_VMAF_RC
   if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
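
The deleted 8-bit `unsharp_rect` above makes the filter explicit: each output pixel is the source plus `amount` times the difference between the source and its Gaussian-blurred version, clamped to the valid range. The 16-bit variant the code now always uses follows the same formula with the clamp widened to the bit depth; a hedged sketch (hypothetical helper, not the library's `highbd_unsharp_rect`):

```c
#include <stdint.h>

static int clamp_px(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* Sketch of unsharp masking on 16-bit planes:
 * out = src + amount * (src - blur), clamped to [0, (1 << bit_depth) - 1]. */
static void unsharp_rect16(const uint16_t *src, int src_stride,
                           const uint16_t *blur, int blur_stride, uint16_t *dst,
                           int dst_stride, int w, int h, double amount,
                           int bit_depth) {
  const int max_val = (1 << bit_depth) - 1;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const double val =
          (double)src[j] + amount * ((double)src[j] - (double)blur[j]);
      dst[j] = (uint16_t)clamp_px((int)(val + 0.5), 0, max_val);
    }
    src += src_stride;
    blur += blur_stride;
    dst += dst_stride;
  }
}
```
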
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 02ce71f..b55e142 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -482,7 +482,6 @@
   param.tx_type = DCT_DCT;
   param.tx_size = max_tx_size;
   param.bd = xd->bd;
-  param.is_hbd = is_cur_buf_hbd(xd);
   param.lossless = 0;
   param.tx_set_type = av1_get_ext_tx_set_type(
       param.tx_size, is_inter_block(xd->mi[0], xd->tree_type), reduced_tx_set);
@@ -530,7 +529,7 @@
   for (int i = 0; i < n4; ++i)
     set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1);
   rd_stats->skip_txfm = 1;
-  if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
   rd_stats->dist = rd_stats->sse = (dist << 4);
   // Though decision is to make the block as skip based on luma stats,
   // it is possible that block becomes non skip after chroma rd. In addition
@@ -614,24 +613,14 @@
     assert(bw <= 32);
     assert(bh <= 32);
     assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
-    if (cpi->common.seq_params.use_highbitdepth) {
-      const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-      const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-      for (int i = 0; i < bh; ++i)
-        for (int j = 0; j < bw; ++j) {
-          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
-          esq[index] +=
-              (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
-              (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
-        }
-    } else {
-      for (int i = 0; i < bh; ++i)
-        for (int j = 0; j < bw; ++j) {
-          const int index = (j >> w_shift) + ((i >> h_shift) << 2);
-          esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
-                        (src[j + i * src_stride] - dst[j + i * dst_stride]);
-        }
-    }
+    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+    for (int i = 0; i < bh; ++i)
+      for (int j = 0; j < bw; ++j) {
+        const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+        esq[index] += (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+                      (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+      }
   } else {  // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
     const int f_index =
         (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
@@ -812,7 +801,7 @@
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int txw = tx_size_wide[tx_size];
   const int txh = tx_size_high[tx_size];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int dequant_shift = xd->bd - 5;
 
   const int q_step =
       ROUND_POWER_OF_TWO(p->dequant_QTX[1], QUANT_TABLE_BITS) >> dequant_shift;
@@ -963,19 +952,6 @@
   return sum / (w * h);
 }
 
-static double get_diff_mean(const uint8_t *src, int src_stride,
-                            const uint8_t *dst, int dst_stride, int w, int h) {
-  double sum = 0.0;
-  for (int j = 0; j < h; ++j) {
-    for (int i = 0; i < w; ++i) {
-      const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
-      sum += diff;
-    }
-  }
-  assert(w > 0 && h > 0);
-  return sum / (w * h);
-}
-
 static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,
                                                 const TileDataEnc *tile_data,
                                                 MACROBLOCK *x,
@@ -1008,7 +984,7 @@
   get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
                      &bh);
   const int num_samples = bw * bh;
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int dequant_shift = xd->bd - 5;
 
   const int q_step =
       ROUND_POWER_OF_TWO(p->dequant_QTX[1], QUANT_TABLE_BITS) >> dequant_shift;
@@ -1067,13 +1043,8 @@
           model_rdcost_norm);
 
   double mean;
-  if (is_cur_buf_hbd(xd)) {
-    mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
-                                pd->dst.stride, bw, bh);
-  } else {
-    mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                         bw, bh);
-  }
+  mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+                              pd->dst.stride, bw, bh);
   mean /= (1 << shift);
   float hor_corr, vert_corr;
   av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
@@ -1198,14 +1169,10 @@
   }
 
   const MACROBLOCKD *xd = &x->e_mbd;
-  if (is_cur_buf_hbd(xd)) {
-    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
-                                             visible_cols, visible_rows);
-    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
-  }
+  uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+                                           visible_cols, visible_rows);
+  return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
 
-  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
-                         visible_rows);
-  return sse;
 }
 
@@ -1262,14 +1229,9 @@
   uint8_t *recon;
   DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
 
-  if (is_cur_buf_hbd(xd)) {
-    recon = CONVERT_TO_BYTEPTR(recon16);
-    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
-                             CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh);
-  } else {
-    recon = (uint8_t *)recon16;
-    aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
-  }
+  recon = CONVERT_TO_BYTEPTR(recon16);
+  aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                           CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh);
 
   const PLANE_TYPE plane_type = get_plane_type(plane);
   TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
@@ -1395,11 +1357,8 @@
   tran_low_t *const coeff = p->coeff + block_offset;
   tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
   MACROBLOCKD *const xd = &x->e_mbd;
-  if (is_cur_buf_hbd(xd))
-    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
-                                       xd->bd);
-  else
-    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+  *out_dist =
+      av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, xd->bd);
 
   *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
   *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
@@ -2242,7 +2201,7 @@
     int *dc_only_blk) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int dequant_shift = xd->bd - 5;
 
   const int qstep =
       ROUND_POWER_OF_TWO(x->plane[plane].dequant_QTX[1], QUANT_TABLE_BITS) >>
@@ -2257,8 +2216,7 @@
                                 per_px_mean, &block_var);
   assert((*block_mse_q8) != UINT_MAX);
   uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
-  if (is_cur_buf_hbd(xd))
-    block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+  block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
   // Early prediction of skip block if residual mean and variance are less
   // than qstep based threshold
   if (((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) &&
@@ -2270,8 +2228,7 @@
 
     x->plane[plane].eobs[block] = 0;
 
-    if (is_cur_buf_hbd(xd))
-      *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
+    *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
 
     best_rd_stats->dist = (*block_sse) << 4;
     best_rd_stats->sse = best_rd_stats->dist;
@@ -2397,7 +2354,7 @@
   int txk_map[TX_TYPES] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   };
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int dequant_shift = xd->bd - 5;
 
   const int qstep =
       ROUND_POWER_OF_TWO(x->plane[plane].dequant_QTX[1], QUANT_TABLE_BITS) >>
@@ -2434,10 +2391,8 @@
                           &txk_allowed, txk_map);
   const uint16_t allowed_tx_mask = tx_mask;
 
-  if (is_cur_buf_hbd(xd)) {
-    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
-    block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
-  }
+  block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+  block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
   block_sse *= 16;
   // Use mse / qstep^2 based threshold logic to take decision of R-D
   // optimization of coeffs. For smaller residuals, coeff optimization
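
A note on the `dequant_shift` and distortion-scaling changes in this file: the old expressions were `is_cur_buf_hbd(xd) ? xd->bd - 5 : 3` and a conditional `ROUND_POWER_OF_TWO(..., (xd->bd - 8) * 2)`. Since 8-bit content now travels through the encoder with `xd->bd == 8`, the unified forms reproduce the old low-bit-depth behaviour exactly: `bd - 5` is 3, and a shift of `(bd - 8) * 2` is a no-op. A trivial check of that arithmetic (the macro mirrors the library's rounding macro, restated here only for the sketch):

```c
#include <assert.h>

/* Assumed definition of the rounding right-shift used above. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

int main(void) {
  const int bd = 8; /* 8-bit content is carried with xd->bd == 8 */
  assert(bd - 5 == 3);                 /* old lowbd dequant_shift constant */
  assert(ROUND_POWER_OF_TWO(1000, (bd - 8) * 2) == 1000); /* shift by 0 */
  return 0;
}
```
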
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c
deleted file mode 100644
index 0f259c7..0000000
--- a/av1/encoder/x86/error_intrin_avx2.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
-                              __m256i *c) {
-  const tran_low_t *addr = coeff + offset;
-
-  if (sizeof(tran_low_t) == 4) {
-    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
-    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
-    const __m256i y = _mm256_packs_epi32(x0, x1);
-    *c = _mm256_permute4x64_epi64(y, 0xD8);
-  } else {
-    *c = _mm256_loadu_si256((const __m256i *)addr);
-  }
-}
-
-int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
-                                intptr_t block_size) {
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i sse_256 = zero;
-  __m256i sse_hi;
-  __m128i sse_128;
-  int64_t sse;
-
-  if (block_size == 16) {
-    // Load 16 elements for coeff and dqcoeff.
-    const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
-    const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
-    // dqcoeff - coeff
-    const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
-    // madd (dqcoeff - coeff)
-    const __m256i error_lo = _mm256_madd_epi16(diff, diff);
-    // Save the higher 64 bit of each 128 bit lane.
-    const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
-    // Add the higher 64 bit to the low 64 bit.
-    const __m256i error = _mm256_add_epi32(error_lo, error_hi);
-    // Expand each double word in the lower 64 bits to quad word.
-    sse_256 = _mm256_unpacklo_epi32(error, zero);
-  } else {
-    for (int i = 0; i < block_size; i += 16) {
-      // Load 16 elements for coeff and dqcoeff.
-      const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
-      const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
-      const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
-      const __m256i error = _mm256_madd_epi16(diff, diff);
-      // Expand each double word of madd (dqcoeff - coeff) to quad word.
-      const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
-      const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
-      // Add each quad word of madd (dqcoeff - coeff).
-      sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
-      sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
-      coeff += 16;
-      dqcoeff += 16;
-    }
-  }
-  // Save the higher 64 bit of each 128 bit lane.
-  sse_hi = _mm256_srli_si256(sse_256, 8);
-  // Add the higher 64 bit to the low 64 bit.
-  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
-
-  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
-  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
-                          _mm256_extractf128_si256(sse_256, 1));
-
-  // Store the results.
-  _mm_storel_epi64((__m128i *)&sse, sse_128);
-  return sse;
-}
-
-int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
-                             intptr_t block_size, int64_t *ssz) {
-  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
-  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
-  __m256i sse_reg_64hi, ssz_reg_64hi;
-  __m128i sse_reg128, ssz_reg128;
-  int64_t sse;
-  int i;
-  const __m256i zero_reg = _mm256_setzero_si256();
-
-  // init sse and ssz registerd to zero
-  sse_reg = _mm256_setzero_si256();
-  ssz_reg = _mm256_setzero_si256();
-
-  for (i = 0; i < block_size; i += 16) {
-    // load 32 bytes from coeff and dqcoeff
-    read_coeff(coeff, i, &coeff_reg);
-    read_coeff(dqcoeff, i, &dqcoeff_reg);
-    // dqcoeff - coeff
-    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
-    // madd (dqcoeff - coeff)
-    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
-    // madd coeff
-    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
-    // expand each double word of madd (dqcoeff - coeff) to quad word
-    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
-    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
-    // expand each double word of madd (coeff) to quad word
-    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
-    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
-    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
-    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
-    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
-    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
-    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
-  }
-  // save the higher 64 bit of each 128 bit lane
-  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
-  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
-  // add the higher 64 bit to the low 64 bit
-  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
-  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
-
-  // add each 64 bit from each of the 128 bit lane of the 256 bit
-  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
-                             _mm256_extractf128_si256(sse_reg, 1));
-
-  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
-                             _mm256_extractf128_si256(ssz_reg, 1));
-
-  // store the results
-  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
-
-  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
-  _mm256_zeroupper();
-  return sse;
-}
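
Since both low-bit-depth block-error kernels go away here, a scalar reference of what they computed may help when checking the callers that now route through `av1_highbd_block_error`: the kernels returned `sum((dqcoeff - coeff)^2)` and wrote `sum(coeff^2)` to `*ssz`. A sketch equivalent in spirit to the removed SIMD code (not copied from the tree):

```c
#include <stdint.h>

typedef int32_t tran_low_t;

/* Scalar equivalent of the deleted av1_block_error_avx2: returns the
 * coefficient reconstruction error and writes the source coefficient energy
 * to *ssz. The AVX2 version computed the same two sums sixteen coefficients
 * at a time and reduced the 64-bit lanes at the end. */
static int64_t block_error_scalar(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  int64_t sse = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int64_t diff = (int64_t)dqcoeff[i] - coeff[i];
    sse += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sqcoeff;
  return sse;
}
```
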
diff --git a/av1/encoder/x86/error_sse2.asm b/av1/encoder/x86/error_sse2.asm
deleted file mode 100644
index 2cffb1e..0000000
--- a/av1/encoder/x86/error_sse2.asm
+++ /dev/null
@@ -1,88 +0,0 @@
-;
-; Copyright (c) 2021, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 3-Clause Clear License and the
-; Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear License was
-; not distributed with this source code in the LICENSE file, you can obtain it
-; at aomedia.org/license/software-license/bsd-3-c-c/.  If the Alliance for Open Media Patent
-; License 1.0 was not distributed with this source code in the PATENTS file, you
-; can obtain it at aomedia.org/license/patent-license/.
-;
-
-;
-
-; Increment %1 by sizeof() tran_low_t * %2.
-%macro INCREMENT_ELEMENTS_TRAN_LOW 2
-  lea %1, [%1 + %2 * 4]
-%endmacro
-
-; Load %2 + %3 into m%1.
-; %3 is the offset in elements, not bytes.
-; If tran_low_t is 16 bits (low bit depth configuration) then load the value
-; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
-; the values down to 16 bits.
-%macro LOAD_TRAN_LOW 3
-  mova     m%1, [%2 + (%3) * 4]
-  packssdw m%1, [%2 + (%3) * 4 + 16]
-%endmacro
-
-%define private_prefix av1
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
-;                         int64_t *ssz)
-
-INIT_XMM sse2
-cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
-  pxor      m4, m4                 ; sse accumulator
-  pxor      m6, m6                 ; ssz accumulator
-  pxor      m5, m5                 ; dedicated zero register
-.loop:
-  LOAD_TRAN_LOW 2, uqcq, 0
-  LOAD_TRAN_LOW 0, dqcq, 0
-  LOAD_TRAN_LOW 3, uqcq, 8
-  LOAD_TRAN_LOW 1, dqcq, 8
-  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
-  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
-  sub    sizeq, 16
-  psubw     m0, m2
-  psubw     m1, m3
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-  pmaddwd   m0, m0
-  pmaddwd   m1, m1
-  pmaddwd   m2, m2
-  pmaddwd   m3, m3
-  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
-  paddd     m0, m1
-  paddd     m2, m3
-  ; accumulate in 64bit
-  punpckldq m7, m0, m5
-  punpckhdq m0, m5
-  paddq     m4, m7
-  punpckldq m7, m2, m5
-  paddq     m4, m0
-  punpckhdq m2, m5
-  paddq     m6, m7
-  paddq     m6, m2
-  jg .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps   m5, m4
-  movhlps   m7, m6
-  paddq     m4, m5
-  paddq     m6, m7
-%if ARCH_X86_64
-  movq    rax, m4
-  movq [sszq], m6
-%else
-  mov     eax, sszm
-  pshufd   m5, m4, 0x1
-  movq  [eax], m6
-  movd    eax, m4
-  movd    edx, m5
-%endif
-  RET
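
The deleted `LOAD_TRAN_LOW` macro above documents the one subtlety of these kernels: in the high-bit-depth configuration `tran_low_t` is 32 bits wide, so coefficients were packed down to 16 bits with signed saturation (`packssdw`) before the 16-bit multiply-accumulate. A scalar sketch of that packing step, for illustration only:

```c
#include <stdint.h>

/* Scalar equivalent of the packssdw step in the deleted asm: clamp a 32-bit
 * coefficient into the int16_t range before it feeds a 16-bit multiply. */
static int16_t pack_coeff(int32_t c) {
  if (c > INT16_MAX) return INT16_MAX;
  if (c < INT16_MIN) return INT16_MIN;
  return (int16_t)c;
}
```
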
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c
index 4d86fb9..3a949ff 100644
--- a/av1/encoder/x86/highbd_temporal_filter_sse2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -245,11 +245,9 @@
     const int num_planes, const double *noise_levels, const MV *subblock_mvs,
     const int *subblock_mses, const int q_factor, const int filter_strength,
     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
   assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-  (void)is_high_bitdepth;
 
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
diff --git a/av1/encoder/x86/pickrst_avx2.c b/av1/encoder/x86/pickrst_avx2.c
index 0e7533f..24caca9 100644
--- a/av1/encoder/x86/pickrst_avx2.c
+++ b/av1/encoder/x86/pickrst_avx2.c
@@ -505,289 +505,6 @@
   }
 }
 
-static INLINE void acc_stat_win5_one_line_avx2(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
-    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
-    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
-    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
-    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
-  int j, k, l;
-  const int wiener_win = WIENER_WIN_CHROMA;
-  // Main loop handles two pixels at a time
-  // We can assume that h_start is even, since it will always be aligned to
-  // a tile edge + some number of restoration units, and both of those will
-  // be 64-pixel aligned.
-  // However, at the edge of the image, h_end may be odd, so we need to handle
-  // that case correctly.
-  assert(h_start % 2 == 0);
-  const int h_end_even = h_end & ~1;
-  const int has_odd_pixel = h_end & 1;
-  for (j = h_start; j < h_end_even; j += 2) {
-    const uint8_t X1 = src[j];
-    const uint8_t X2 = src[j + 1];
-    *sumX += X1 + X2;
-    const uint8_t *dgd_ij = dgd + j;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        const uint8_t D2 = dgd_ijk[l + 1];
-        sumY[k][l] += D1 + D2;
-        M_int[k][l] += D1 * X1 + D2 * X2;
-
-        const __m256i kl =
-            _mm256_cvtepu8_epi16(_mm_set1_epi16(loadu_uint16(dgd_ijk + l)));
-        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-  // If the width is odd, add in the final pixel
-  if (has_odd_pixel) {
-    const uint8_t X1 = src[j];
-    *sumX += X1;
-    const uint8_t *dgd_ij = dgd + j;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        sumY[k][l] += D1;
-        M_int[k][l] += D1 * X1;
-
-        // The `acc_stat_avx2` function wants its input to have interleaved
-        // copies of two pixels, but we only have one. However, the pixels
-        // are (effectively) used as inputs to a multiply-accumulate.
-        // So if we set the extra pixel slot to 0, then it is effectively
-        // ignored.
-        const __m256i kl = _mm256_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
-        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-}
-
-static INLINE void compute_stats_win5_opt_avx2(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
-  int i, j, k, l, m, n;
-  const int wiener_win = WIENER_WIN_CHROMA;
-  const int pixel_count = (h_end - h_start) * (v_end - v_start);
-  const int wiener_win2 = wiener_win * wiener_win;
-  const int wiener_halfwin = (wiener_win >> 1);
-  uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
-  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  DECLARE_ALIGNED(
-      32, int32_t,
-      H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
-  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
-  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int32_t sumX = 0;
-  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
-  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
-  for (j = v_start; j < v_end; j += 64) {
-    const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
-      acc_stat_win5_one_line_avx2(
-          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
-    }
-    for (k = 0; k < wiener_win; ++k) {
-      for (l = 0; l < wiener_win; ++l) {
-        M_int64[k][l] += M_int32[k][l];
-        M_int32[k][l] = 0;
-      }
-    }
-    for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
-      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
-        H_int64[k][l] += H_int32[k][l];
-        H_int32[k][l] = 0;
-      }
-    }
-  }
-
-  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
-  for (k = 0; k < wiener_win; k++) {
-    for (l = 0; l < wiener_win; l++) {
-      const int32_t idx0 = l * wiener_win + k;
-      M[idx0] =
-          M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
-      int64_t *H_ = H + idx0 * wiener_win2;
-      int64_t *H_int_ = &H_int64[idx0][0];
-      for (m = 0; m < wiener_win; m++) {
-        for (n = 0; n < wiener_win; n++) {
-          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
-                                   (int64_t)avg * (sumY[k][l] + sumY[n][m]);
-        }
-      }
-    }
-  }
-}
-
-void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
-                            const uint8_t *src, int h_start, int h_end,
-                            int v_start, int v_end, int dgd_stride,
-                            int src_stride, int64_t *M, int64_t *H) {
-  if (wiener_win == WIENER_WIN) {
-    compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
-                                dgd_stride, src_stride, M, H);
-  } else if (wiener_win == WIENER_WIN_CHROMA) {
-    compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
-                                dgd_stride, src_stride, M, H);
-  } else {
-    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M, H);
-  }
-}
-
-static INLINE __m256i pair_set_epi16(int a, int b) {
-  return _mm256_set1_epi32(
-      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
-}
-
-int64_t av1_lowbd_pixel_proj_error_avx2(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
-  int i, j, k;
-  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
-  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
-  __m256i sum64 = _mm256_setzero_si256();
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  int64_t err = 0;
-  if (params->r[0] > 0 && params->r[1] > 0) {
-    __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
-    for (i = 0; i < height; ++i) {
-      __m256i sum32 = _mm256_setzero_si256();
-      for (j = 0; j <= width - 16; j += 16) {
-        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
-        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i flt0_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
-                               yy_loadu_256(flt0 + j + 8)),
-            0xd8);
-        const __m256i flt1_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
-                               yy_loadu_256(flt1 + j + 8)),
-            0xd8);
-        const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
-        const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
-        const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
-        const __m256i v0 = _mm256_madd_epi16(
-            xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
-        const __m256i v1 = _mm256_madd_epi16(
-            xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
-        const __m256i vr0 =
-            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
-        const __m256i vr1 =
-            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
-        const __m256i e0 = _mm256_sub_epi16(
-            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
-        const __m256i err0 = _mm256_madd_epi16(e0, e0);
-        sum32 = _mm256_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt0 += flt0_stride;
-      flt1 += flt1_stride;
-      const __m256i sum64_0 =
-          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
-      const __m256i sum64_1 =
-          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
-      sum64 = _mm256_add_epi64(sum64, sum64_0);
-      sum64 = _mm256_add_epi64(sum64, sum64_1);
-    }
-  } else if (params->r[0] > 0 || params->r[1] > 0) {
-    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
-    const __m256i xq_coeff =
-        pair_set_epi16(xq_active, (-xq_active * (1 << SGRPROJ_RST_BITS)));
-    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
-    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
-    for (i = 0; i < height; ++i) {
-      __m256i sum32 = _mm256_setzero_si256();
-      for (j = 0; j <= width - 16; j += 16) {
-        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
-        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i flt_16b = _mm256_permute4x64_epi64(
-            _mm256_packs_epi32(yy_loadu_256(flt + j),
-                               yy_loadu_256(flt + j + 8)),
-            0xd8);
-        const __m256i v0 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
-        const __m256i v1 =
-            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
-        const __m256i vr0 =
-            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
-        const __m256i vr1 =
-            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
-        const __m256i e0 = _mm256_sub_epi16(
-            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
-        const __m256i err0 = _mm256_madd_epi16(e0, e0);
-        sum32 = _mm256_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq_active * (flt[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt += flt_stride;
-      const __m256i sum64_0 =
-          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
-      const __m256i sum64_1 =
-          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
-      sum64 = _mm256_add_epi64(sum64, sum64_0);
-      sum64 = _mm256_add_epi64(sum64, sum64_1);
-    }
-  } else {
-    __m256i sum32 = _mm256_setzero_si256();
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j <= width - 16; j += 16) {
-        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
-        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
-        const __m256i diff0 = _mm256_sub_epi16(d0, s0);
-        const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
-        sum32 = _mm256_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t e = (int32_t)(dat[k]) - src[k];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-    }
-    const __m256i sum64_0 =
-        _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
-    const __m256i sum64_1 =
-        _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
-    sum64 = _mm256_add_epi64(sum64_0, sum64_1);
-  }
-  int64_t sum[4];
-  yy_storeu_256(sum, sum64);
-  err += sum[0] + sum[1] + sum[2] + sum[3];
-  return err;
-}
-
 // When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
 // C and H need to be computed.
 static AOM_INLINE void calc_proj_params_r0_r1_avx2(
@@ -881,144 +598,6 @@
   C[1] /= size;
 }
 
-// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
-// non-zero and need to be computed.
-static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width,
-                                                int height, int src_stride,
-                                                const uint8_t *dat8,
-                                                int dat_stride, int32_t *flt0,
-                                                int flt0_stride,
-                                                int64_t H[2][2], int64_t C[2]) {
-  const int size = width * height;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  __m256i h00, c0;
-  const __m256i zero = _mm256_setzero_si256();
-  c0 = h00 = zero;
-
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; j += 8) {
-      const __m256i u_load = _mm256_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
-      const __m256i s_load = _mm256_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
-      __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
-      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
-      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
-      s = _mm256_sub_epi32(s, d);
-      f1 = _mm256_sub_epi32(f1, d);
-
-      const __m256i h00_even = _mm256_mul_epi32(f1, f1);
-      const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
-                                               _mm256_srli_epi64(f1, 32));
-      h00 = _mm256_add_epi64(h00, h00_even);
-      h00 = _mm256_add_epi64(h00, h00_odd);
-
-      const __m256i c0_even = _mm256_mul_epi32(f1, s);
-      const __m256i c0_odd =
-          _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
-      c0 = _mm256_add_epi64(c0, c0_even);
-      c0 = _mm256_add_epi64(c0, c0_odd);
-    }
-  }
-  const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
-                                           _mm256_castsi256_si128(h00));
-  const __m128i h00_val =
-      _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
-
-  const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
-                                          _mm256_castsi256_si128(c0));
-  const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
-
-  const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
-  const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
-
-  xx_storeu_128(C, c);
-  xx_storeu_128(H[0], h0x);
-
-  H[0][0] /= size;
-  C[0] /= size;
-}
-
-// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
-// non-zero and need to be computed.
-static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width,
-                                                int height, int src_stride,
-                                                const uint8_t *dat8,
-                                                int dat_stride, int32_t *flt1,
-                                                int flt1_stride,
-                                                int64_t H[2][2], int64_t C[2]) {
-  const int size = width * height;
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  __m256i h11, c1;
-  const __m256i zero = _mm256_setzero_si256();
-  c1 = h11 = zero;
-
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; j += 8) {
-      const __m256i u_load = _mm256_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
-      const __m256i s_load = _mm256_cvtepu8_epi32(
-          _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
-      __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
-      __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
-      __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
-      s = _mm256_sub_epi32(s, d);
-      f2 = _mm256_sub_epi32(f2, d);
-
-      const __m256i h11_even = _mm256_mul_epi32(f2, f2);
-      const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
-                                               _mm256_srli_epi64(f2, 32));
-      h11 = _mm256_add_epi64(h11, h11_even);
-      h11 = _mm256_add_epi64(h11, h11_odd);
-
-      const __m256i c1_even = _mm256_mul_epi32(f2, s);
-      const __m256i c1_odd =
-          _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
-      c1 = _mm256_add_epi64(c1, c1_even);
-      c1 = _mm256_add_epi64(c1, c1_odd);
-    }
-  }
-
-  const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
-                                           _mm256_castsi256_si128(h11));
-  const __m128i h11_val =
-      _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
-
-  const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
-                                          _mm256_castsi256_si128(c1));
-  const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
-
-  const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
-  const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
-
-  xx_storeu_128(C, c);
-  xx_storeu_128(H[1], h1x);
-
-  H[1][1] /= size;
-  C[1] /= size;
-}
-
-// AVX2 variant of av1_calc_proj_params_c.
-void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
-                               int src_stride, const uint8_t *dat8,
-                               int dat_stride, int32_t *flt0, int flt0_stride,
-                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
-                               int64_t C[2], const sgr_params_type *params) {
-  if ((params->r[0] > 0) && (params->r[1] > 0)) {
-    calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8,
-                                dat_stride, flt0, flt0_stride, flt1,
-                                flt1_stride, H, C);
-  } else if (params->r[0] > 0) {
-    calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride,
-                             flt0, flt0_stride, H, C);
-  } else if (params->r[1] > 0) {
-    calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride,
-                             flt1, flt1_stride, H, C);
-  }
-}
-
 int64_t av1_highbd_pixel_proj_error_avx2(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
diff --git a/av1/encoder/x86/pickrst_sse4.c b/av1/encoder/x86/pickrst_sse4.c
index 61c1979..f97ad56 100644
--- a/av1/encoder/x86/pickrst_sse4.c
+++ b/av1/encoder/x86/pickrst_sse4.c
@@ -512,277 +512,6 @@
   }
 }
 
-static INLINE void acc_stat_win5_one_line_sse4_1(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
-    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
-    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
-    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
-    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
-  const int wiener_win = WIENER_WIN_CHROMA;
-  int j, k, l;
-  // Main loop handles two pixels at a time
-  // We can assume that h_start is even, since it will always be aligned to
-  // a tile edge + some number of restoration units, and both of those will
-  // be 64-pixel aligned.
-  // However, at the edge of the image, h_end may be odd, so we need to handle
-  // that case correctly.
-  assert(h_start % 2 == 0);
-  const int h_end_even = h_end & ~1;
-  const int has_odd_pixel = h_end & 1;
-  for (j = h_start; j < h_end_even; j += 2) {
-    const uint8_t *dgd_ij = dgd + j;
-    const uint8_t X1 = src[j];
-    const uint8_t X2 = src[j + 1];
-    *sumX += X1 + X2;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        const uint8_t D2 = dgd_ijk[l + 1];
-        sumY[k][l] += D1 + D2;
-        M_int[k][l] += D1 * X1 + D2 * X2;
-
-        const __m128i kl =
-            _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
-        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-  // If the width is odd, add in the final pixel
-  if (has_odd_pixel) {
-    const uint8_t *dgd_ij = dgd + j;
-    const uint8_t X1 = src[j];
-    *sumX += X1;
-    for (k = 0; k < wiener_win; k++) {
-      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
-      for (l = 0; l < wiener_win; l++) {
-        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
-        const uint8_t D1 = dgd_ijk[l];
-        sumY[k][l] += D1;
-        M_int[k][l] += D1 * X1;
-
-        // The `acc_stat_sse41` function wants its input to have interleaved
-        // copies of two pixels, but we only have one. However, the pixels
-        // are (effectively) used as inputs to a multiply-accumulate.
-        // So if we set the extra pixel slot to 0, then it is effectively
-        // ignored.
-        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((uint16_t)D1));
-        acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
-        acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
-      }
-    }
-  }
-}
-
-static INLINE void compute_stats_win5_opt_sse4_1(
-    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
-    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
-  int i, j, k, l, m, n;
-  const int wiener_win = WIENER_WIN_CHROMA;
-  const int pixel_count = (h_end - h_start) * (v_end - v_start);
-  const int wiener_win2 = wiener_win * wiener_win;
-  const int wiener_halfwin = (wiener_win >> 1);
-  const uint8_t avg =
-      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
-  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
-  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
-  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
-  int32_t sumX = 0;
-  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
-  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
-  for (j = v_start; j < v_end; j += 64) {
-    const int vert_end = AOMMIN(64, v_end - j) + j;
-    for (i = j; i < vert_end; i++) {
-      acc_stat_win5_one_line_sse4_1(
-          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
-          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
-    }
-    for (k = 0; k < wiener_win; ++k) {
-      for (l = 0; l < wiener_win; ++l) {
-        M_int64[k][l] += M_int32[k][l];
-        M_int32[k][l] = 0;
-      }
-    }
-    for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
-      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
-        H_int64[k][l] += H_int32[k][l];
-        H_int32[k][l] = 0;
-      }
-    }
-  }
-
-  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
-  for (k = 0; k < wiener_win; k++) {
-    for (l = 0; l < wiener_win; l++) {
-      const int32_t idx0 = l * wiener_win + k;
-      M[idx0] =
-          M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
-      int64_t *H_ = H + idx0 * wiener_win2;
-      int64_t *H_int_ = &H_int64[idx0][0];
-      for (m = 0; m < wiener_win; m++) {
-        for (n = 0; n < wiener_win; n++) {
-          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
-                                   (int64_t)avg * (sumY[k][l] + sumY[n][m]);
-        }
-      }
-    }
-  }
-}
-void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
-                              const uint8_t *src, int h_start, int h_end,
-                              int v_start, int v_end, int dgd_stride,
-                              int src_stride, int64_t *M, int64_t *H) {
-  if (wiener_win == WIENER_WIN) {
-    compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
-                                  dgd_stride, src_stride, M, H);
-  } else if (wiener_win == WIENER_WIN_CHROMA) {
-    compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
-                                  dgd_stride, src_stride, M, H);
-  } else {
-    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M, H);
-  }
-}
-
-static INLINE __m128i pair_set_epi16(int a, int b) {
-  return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
-}
-
-int64_t av1_lowbd_pixel_proj_error_sse4_1(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
-  int i, j, k;
-  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
-  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
-  __m128i sum64 = _mm_setzero_si128();
-  const uint8_t *src = src8;
-  const uint8_t *dat = dat8;
-  int64_t err = 0;
-  if (params->r[0] > 0 && params->r[1] > 0) {
-    __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
-    for (i = 0; i < height; ++i) {
-      __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j <= width - 8; j += 8) {
-        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
-        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
-        const __m128i flt0_16b =
-            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
-        const __m128i flt1_16b =
-            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
-        const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
-        const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
-        const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
-        const __m128i v0 = _mm_madd_epi16(
-            xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
-        const __m128i v1 = _mm_madd_epi16(
-            xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
-        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
-        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
-        const __m128i e0 =
-            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
-        const __m128i err0 = _mm_madd_epi16(e0, e0);
-        sum32 = _mm_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt0 += flt0_stride;
-      flt1 += flt1_stride;
-      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
-      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
-      sum64 = _mm_add_epi64(sum64, sum64_0);
-      sum64 = _mm_add_epi64(sum64, sum64_1);
-    }
-  } else if (params->r[0] > 0 || params->r[1] > 0) {
-    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
-    const __m128i xq_coeff =
-        pair_set_epi16(xq_active, -(xq_active << SGRPROJ_RST_BITS));
-    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
-    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
-    for (i = 0; i < height; ++i) {
-      __m128i sum32 = _mm_setzero_si128();
-      for (j = 0; j <= width - 8; j += 8) {
-        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
-        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
-        const __m128i flt_16b =
-            _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
-        const __m128i v0 =
-            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
-        const __m128i v1 =
-            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
-        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
-        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
-        const __m128i e0 =
-            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
-        const __m128i err0 = _mm_madd_epi16(e0, e0);
-        sum32 = _mm_add_epi32(sum32, err0);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
-        int32_t v = xq_active * (flt[k] - u);
-        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-      flt += flt_stride;
-      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
-      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
-      sum64 = _mm_add_epi64(sum64, sum64_0);
-      sum64 = _mm_add_epi64(sum64, sum64_1);
-    }
-  } else {
-    __m128i sum32 = _mm_setzero_si128();
-    for (i = 0; i < height; ++i) {
-      for (j = 0; j <= width - 16; j += 16) {
-        const __m128i d = xx_loadu_128(dat + j);
-        const __m128i s = xx_loadu_128(src + j);
-        const __m128i d0 = _mm_cvtepu8_epi16(d);
-        const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
-        const __m128i s0 = _mm_cvtepu8_epi16(s);
-        const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
-        const __m128i diff0 = _mm_sub_epi16(d0, s0);
-        const __m128i diff1 = _mm_sub_epi16(d1, s1);
-        const __m128i err0 = _mm_madd_epi16(diff0, diff0);
-        const __m128i err1 = _mm_madd_epi16(diff1, diff1);
-        sum32 = _mm_add_epi32(sum32, err0);
-        sum32 = _mm_add_epi32(sum32, err1);
-      }
-      for (k = j; k < width; ++k) {
-        const int32_t e = (int32_t)(dat[k]) - src[k];
-        err += ((int64_t)e * e);
-      }
-      dat += dat_stride;
-      src += src_stride;
-    }
-    const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
-    const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
-    sum64 = _mm_add_epi64(sum64_0, sum64_1);
-  }
-  int64_t sum[2];
-  xx_storeu_128(sum, sum64);
-  err += sum[0] + sum[1];
-  return err;
-}
-
 int64_t av1_highbd_pixel_proj_error_sse4_1(
     const uint8_t *src8, int width, int height, int src_stride,
     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
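
For reference, the per-pixel quantity that both removed low-bitdepth kernels above accumulate (and that the remaining high-bitdepth paths keep computing on 16-bit samples) reduces to the scalar form below. This is a sketch distilled from the removed tail loops, not a function that exists in the tree; the macros are assumed to come from the usual libaom headers.

    /* Scalar reference for the self-guided projection error: what the
     * vectorized loops above accumulate per pixel when both filters are
     * active (params->r[0] > 0 && params->r[1] > 0). Sketch only. */
    #include <stdint.h>
    #include "aom_dsp/aom_dsp_common.h"  /* ROUND_POWER_OF_TWO */
    #include "av1/common/restoration.h"  /* SGRPROJ_RST_BITS, SGRPROJ_PRJ_BITS */

    static int64_t pixel_proj_error_one_pixel(uint8_t dat, uint8_t src,
                                              int32_t flt0, int32_t flt1,
                                              const int xq[2]) {
      const int shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
      const int32_t u = (int32_t)dat << SGRPROJ_RST_BITS;  /* upscaled source */
      const int32_t v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u);
      const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat - src;
      return (int64_t)e * e;  /* summed over the restoration unit by callers */
    }
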
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
deleted file mode 100644
index d53da44..0000000
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/av1_rtcd.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/temporal_filter.h"
-
-#define SSE_STRIDE (BW + 2)
-
-DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
-  { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
-  { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
-  { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
-  { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = {
-  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
-};
-
-static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
-    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
-    const unsigned int stride2, const int block_width, const int block_height,
-    uint16_t *frame_sse, const unsigned int sse_stride) {
-  (void)block_width;
-  const uint8_t *src1 = frame1;
-  const uint8_t *src2 = frame2;
-  uint16_t *dst = frame_sse;
-  for (int i = 0; i < block_height; i++) {
-    __m128i vf1_128, vf2_128;
-    __m256i vf1, vf2, vdiff1, vsqdiff1;
-
-    vf1_128 = _mm_loadu_si128((__m128i *)(src1));
-    vf2_128 = _mm_loadu_si128((__m128i *)(src2));
-    vf1 = _mm256_cvtepu8_epi16(vf1_128);
-    vf2 = _mm256_cvtepu8_epi16(vf2_128);
-    vdiff1 = _mm256_sub_epi16(vf1, vf2);
-    vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
-
-    _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
-    // Set zero to uninitialized memory to avoid uninitialized loads later
-    *(uint32_t *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
-
-    src1 += stride, src2 += stride2;
-    dst += sse_stride;
-  }
-}
-
-static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
-    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
-    const unsigned int stride2, const int block_width, const int block_height,
-    uint16_t *frame_sse, const unsigned int sse_stride) {
-  (void)block_width;
-  const uint8_t *src1 = frame1;
-  const uint8_t *src2 = frame2;
-  uint16_t *dst = frame_sse;
-  for (int i = 0; i < block_height; i++) {
-    __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
-
-    vsrc1 = _mm256_loadu_si256((__m256i *)src1);
-    vsrc2 = _mm256_loadu_si256((__m256i *)src2);
-    vmax = _mm256_max_epu8(vsrc1, vsrc2);
-    vmin = _mm256_min_epu8(vsrc1, vsrc2);
-    vdiff = _mm256_subs_epu8(vmax, vmin);
-
-    __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
-    __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
-    vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
-    vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
-
-    vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
-    vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
-    _mm256_storeu_si256((__m256i *)(dst), vres1);
-    _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
-    // Set zero to uninitialized memory to avoid uninitialized loads later
-    *(uint32_t *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
-
-    src1 += stride;
-    src2 += stride2;
-    dst += sse_stride;
-  }
-}
-
-static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col,
-                                                int block_width) {
-  __m128i v128tmp = _mm_loadu_si128((__m128i *)(src));
-  if (col == 0) {
-    // For the first column, replicate the first element twice to the left
-    v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]);
-  }
-  if (col == block_width - 4) {
-    // For the last column, replicate the last element twice to the right
-    v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]);
-  }
-  return _mm256_cvtepu16_epi32(v128tmp);
-}
-
-static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
-  // Mask the required 5 values inside the vector
-  __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
-  __m128i v128a, v128b;
-  // Extract 256b as two 128b registers A and B
-  v128a = _mm256_castsi256_si128(vtmp);
-  v128b = _mm256_extracti128_si256(vtmp, 1);
-  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
-  v128a = _mm_add_epi32(v128a, v128b);
-  // B = [A2+B2, A3+B3, 0, 0]
-  v128b = _mm_srli_si128(v128a, 8);
-  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
-  v128a = _mm_add_epi32(v128a, v128b);
-  // B = [A1+B1+A3+B3, 0, 0, 0]
-  v128b = _mm_srli_si128(v128a, 4);
-  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
-  v128a = _mm_add_epi32(v128a, v128b);
-  return _mm_extract_epi32(v128a, 0);
-}
-
-static void apply_temporal_filter(
-    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
-    const unsigned int stride2, const int block_width, const int block_height,
-    const int min_frame_size, const double sigma, const MV *subblock_mvs,
-    const int *subblock_mses, const int q_factor, const int filter_strength,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(((block_width == 32) && (block_height == 32)) ||
-         ((block_width == 16) && (block_height == 16)));
-  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
-
-  uint32_t acc_5x5_sse[BH][BW];
-  uint16_t *frame_sse =
-      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
-
-  if (block_width == 32) {
-    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
-                                 block_height, frame_sse, SSE_STRIDE);
-  } else {
-    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
-                                 block_height, frame_sse, SSE_STRIDE);
-  }
-
-  __m256i vsrc[5];
-
-  const double n_decay = 0.5 + log(2 * sigma + 5.0);
-  const double q_decay =
-      CLIP(pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2), 1e-5, 1);
-  const double s_decay =
-      CLIP(pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2), 1e-5, 1);
-
-  // Traverse 4 columns at a time
-  // First and last columns will require padding
-  for (int col = 0; col < block_width; col += 4) {
-    uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse;
-
-    // Load and pad(for first and last col) 3 rows from the top
-    for (int i = 2; i < 5; i++) {
-      vsrc[i] = xx_load_and_pad(src, col, block_width);
-      src += SSE_STRIDE;
-    }
-
-    // Copy first row to first 2 vectors
-    vsrc[0] = vsrc[2];
-    vsrc[1] = vsrc[2];
-
-    for (int row = 0; row < block_height; row++) {
-      __m256i vsum = _mm256_setzero_si256();
-
-      // Add 5 consecutive rows
-      for (int i = 0; i < 5; i++) {
-        vsum = _mm256_add_epi32(vsum, vsrc[i]);
-      }
-
-      // Push all elements by one element to the top
-      for (int i = 0; i < 4; i++) {
-        vsrc[i] = vsrc[i + 1];
-      }
-
-      // Load next row to the last element
-      if (row <= block_width - 4) {
-        vsrc[4] = xx_load_and_pad(src, col, block_width);
-        src += SSE_STRIDE;
-      } else {
-        vsrc[4] = vsrc[3];
-      }
-
-      // Accumulate the sum horizontally
-      for (int i = 0; i < 4; i++) {
-        acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i);
-      }
-    }
-  }
-
-  for (int i = 0, k = 0; i < block_height; i++) {
-    for (int j = 0; j < block_width; j++, k++) {
-      const int pixel_value = frame2[i * stride2 + j];
-
-      int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH;
-
-      // Filter U-plane and V-plane using Y-plane. This is because motion
-      // search is only done on Y-plane, so the information from Y-plane will
-      // be more accurate.
-      if (plane != PLANE_TYPE_Y) {
-        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-            const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-            const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-            diff_sse += luma_sq_error[yy * SSE_STRIDE + xx];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const double window_error = (double)(diff_sse) / num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
-      const double combined_error =
-          (TF_WINDOW_BLOCK_BALANCE_WEIGHT * window_error + block_error) /
-          (TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) / TF_SEARCH_ERROR_NORM_WEIGHT;
-
-      const MV mv = subblock_mvs[subblock_idx];
-      const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-      const double distance_threshold =
-          (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1);
-      const double d_factor = AOMMAX(distance / distance_threshold, 1);
-
-      const double scaled_error =
-          AOMMIN(combined_error * d_factor / n_decay / q_decay / s_decay, 7);
-      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
-
-      count[k] += weight;
-      accumulator[k] += weight * pixel_value;
-    }
-  }
-}
-
-void av1_apply_temporal_filter_avx2(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
-    const int *subblock_mses, const int q_factor, const int filter_strength,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
-  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
-  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
-  assert(!is_high_bitdepth && "Only support low bit-depth with avx2!");
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-  (void)is_high_bitdepth;
-
-  const int mb_height = block_size_high[block_size];
-  const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int frame_height = frame_to_filter->y_crop_height;
-  const int frame_width = frame_to_filter->y_crop_width;
-  const int min_frame_size = AOMMIN(frame_height, frame_width);
-  uint16_t luma_sq_error[SSE_STRIDE * BH];
-  uint16_t *chroma_sq_error =
-      (num_planes > 0)
-          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
-          : NULL;
-
-  for (int plane = 0; plane < num_planes; ++plane) {
-    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-
-    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
-    const int ss_x_shift =
-        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
-    const int ss_y_shift =
-        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
-
-    apply_temporal_filter(ref, frame_stride, pred + mb_pels * plane, plane_w,
-                          plane_w, plane_h, min_frame_size, noise_levels[plane],
-                          subblock_mvs, subblock_mses, q_factor,
-                          filter_strength, accum + mb_pels * plane,
-                          count + mb_pels * plane, luma_sq_error,
-                          chroma_sq_error, plane, ss_x_shift, ss_y_shift);
-  }
-  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
-}
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
deleted file mode 100644
index 8f204a7..0000000
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-
-#include "config/av1_rtcd.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/temporal_filter.h"
-
-// For the squared error buffer, keep a padding for 4 samples
-#define SSE_STRIDE (BW + 4)
-
-DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
-  { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
-    { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
-  { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
-    { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
-  { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
-    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
-  { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
-    { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
-};
-
-static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
-                              const uint8_t *frame2, const unsigned int stride2,
-                              const int block_width, const int block_height,
-                              uint16_t *frame_sse,
-                              const unsigned int dst_stride) {
-  const uint8_t *src1 = frame1;
-  const uint8_t *src2 = frame2;
-  uint16_t *dst = frame_sse;
-
-  for (int i = 0; i < block_height; i++) {
-    for (int j = 0; j < block_width; j += 16) {
-      // Set zero to uninitialized memory to avoid uninitialized loads later
-      *(uint32_t *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128());
-
-      __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
-      __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
-
-      __m128i vmax = _mm_max_epu8(vsrc1, vsrc2);
-      __m128i vmin = _mm_min_epu8(vsrc1, vsrc2);
-      __m128i vdiff = _mm_subs_epu8(vmax, vmin);
-
-      __m128i vzero = _mm_setzero_si128();
-      __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero);
-      __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero);
-
-      __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1);
-      __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2);
-
-      _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
-      _mm_storeu_si128((__m128i *)(dst + j + 10), vres2);
-    }
-
-    // Set zero to uninitialized memory to avoid uninitialized loads later
-    *(uint32_t *)(dst + block_width + 2) =
-        _mm_cvtsi128_si32(_mm_setzero_si128());
-
-    src1 += stride;
-    src2 += stride2;
-    dst += dst_stride;
-  }
-}
-
-static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col,
-                            int block_width) {
-  __m128i vtmp = _mm_loadu_si128((__m128i *)src);
-  __m128i vzero = _mm_setzero_si128();
-  __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero);
-  __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero);
-  // For the first column, replicate the first element twice to the left
-  dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
-  // For the last column, replicate the last element twice to the right
-  dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
-}
-
-static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
-  __m128i veca, vecb;
-  // Mask and obtain the required 5 values inside the vector
-  veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
-  vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
-  // A = [A0+B0, A1+B1, A2+B2, A3+B3]
-  veca = _mm_add_epi32(veca, vecb);
-  // B = [A2+B2, A3+B3, 0, 0]
-  vecb = _mm_srli_si128(veca, 8);
-  // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
-  veca = _mm_add_epi32(veca, vecb);
-  // B = [A1+B1+A3+B3, 0, 0, 0]
-  vecb = _mm_srli_si128(veca, 4);
-  // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
-  veca = _mm_add_epi32(veca, vecb);
-  return _mm_cvtsi128_si32(veca);
-}
-
-static void apply_temporal_filter(
-    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
-    const unsigned int stride2, const int block_width, const int block_height,
-    const int min_frame_size, const double sigma, const MV *subblock_mvs,
-    const int *subblock_mses, const int q_factor, const int filter_strength,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(((block_width == 32) && (block_height == 32)) ||
-         ((block_width == 16) && (block_height == 16)));
-  if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
-
-  uint32_t acc_5x5_sse[BH][BW];
-  uint16_t *frame_sse =
-      (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
-
-  get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
-                    frame_sse, SSE_STRIDE);
-
-  __m128i vsrc[5][2];
-
-  const double n_decay = 0.5 + log(2 * sigma + 5.0);
-  const double q_decay =
-      CLIP(pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2), 1e-5, 1);
-  const double s_decay =
-      CLIP(pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2), 1e-5, 1);
-
-  // Traverse 4 columns at a time
-  // First and last columns will require padding
-  for (int col = 0; col < block_width; col += 4) {
-    uint16_t *src = frame_sse + col;
-
-    // Load and pad(for first and last col) 3 rows from the top
-    for (int i = 2; i < 5; i++) {
-      xx_load_and_pad(src, vsrc[i], col, block_width);
-      src += SSE_STRIDE;
-    }
-
-    // Padding for top 2 rows
-    vsrc[0][0] = vsrc[2][0];
-    vsrc[0][1] = vsrc[2][1];
-    vsrc[1][0] = vsrc[2][0];
-    vsrc[1][1] = vsrc[2][1];
-
-    for (int row = 0; row < block_height; row++) {
-      __m128i vsum1 = _mm_setzero_si128();
-      __m128i vsum2 = _mm_setzero_si128();
-
-      // Add 5 consecutive rows
-      for (int i = 0; i < 5; i++) {
-        vsum1 = _mm_add_epi32(vsrc[i][0], vsum1);
-        vsum2 = _mm_add_epi32(vsrc[i][1], vsum2);
-      }
-
-      // Push all elements by one element to the top
-      for (int i = 0; i < 4; i++) {
-        vsrc[i][0] = vsrc[i + 1][0];
-        vsrc[i][1] = vsrc[i + 1][1];
-      }
-
-      if (row <= block_height - 4) {
-        // Load next row
-        xx_load_and_pad(src, vsrc[4], col, block_width);
-        src += SSE_STRIDE;
-      } else {
-        // Padding for bottom 2 rows
-        vsrc[4][0] = vsrc[3][0];
-        vsrc[4][1] = vsrc[3][1];
-      }
-
-      // Accumulate the sum horizontally
-      for (int i = 0; i < 4; i++) {
-        acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i);
-      }
-    }
-  }
-
-  for (int i = 0, k = 0; i < block_height; i++) {
-    for (int j = 0; j < block_width; j++, k++) {
-      const int pixel_value = frame2[i * stride2 + j];
-
-      int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH;
-
-      // Filter U-plane and V-plane using Y-plane. This is because motion
-      // search is only done on Y-plane, so the information from Y-plane will
-      // be more accurate.
-      if (plane != PLANE_TYPE_Y) {
-        for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-          for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-            const int yy = (i << ss_y_shift) + ii;      // Y-coord on Y-plane.
-            const int xx = (j << ss_x_shift) + jj + 2;  // X-coord on Y-plane.
-            const int ww = SSE_STRIDE;                  // Stride of Y-plane.
-            diff_sse += luma_sq_error[yy * ww + xx];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const double window_error = (double)(diff_sse) / num_ref_pixels;
-      const int subblock_idx =
-          (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error = (double)subblock_mses[subblock_idx];
-      const double combined_error =
-          (TF_WINDOW_BLOCK_BALANCE_WEIGHT * window_error + block_error) /
-          (TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) / TF_SEARCH_ERROR_NORM_WEIGHT;
-
-      const MV mv = subblock_mvs[subblock_idx];
-      const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-      const double distance_threshold =
-          (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1);
-      const double d_factor = AOMMAX(distance / distance_threshold, 1);
-
-      const double scaled_error =
-          AOMMIN(combined_error * d_factor / n_decay / q_decay / s_decay, 7);
-      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
-
-      count[k] += weight;
-      accumulator[k] += weight * pixel_value;
-    }
-  }
-}
-
-void av1_apply_temporal_filter_sse2(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
-    const int *subblock_mses, const int q_factor, const int filter_strength,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
-  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
-  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
-  assert(!is_high_bitdepth && "Only support low bit-depth with sse2!");
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-  (void)is_high_bitdepth;
-
-  const int mb_height = block_size_high[block_size];
-  const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int frame_height = frame_to_filter->y_crop_height;
-  const int frame_width = frame_to_filter->y_crop_width;
-  const int min_frame_size = AOMMIN(frame_height, frame_width);
-  uint16_t luma_sq_error[SSE_STRIDE * BH];
-  uint16_t *chroma_sq_error =
-      (num_planes > 0)
-          ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t))
-          : NULL;
-
-  for (int plane = 0; plane < num_planes; ++plane) {
-    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-
-    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
-    const int ss_x_shift =
-        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
-    const int ss_y_shift =
-        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
-
-    apply_temporal_filter(ref, frame_stride, pred + mb_pels * plane, plane_w,
-                          plane_w, plane_h, min_frame_size, noise_levels[plane],
-                          subblock_mvs, subblock_mses, q_factor,
-                          filter_strength, accum + mb_pels * plane,
-                          count + mb_pels * plane, luma_sq_error,
-                          chroma_sq_error, plane, ss_x_shift, ss_y_shift);
-  }
-  if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
-}
diff --git a/common/tools_common.c b/common/tools_common.c
index 5ddec93..83199f3 100644
--- a/common/tools_common.c
+++ b/common/tools_common.c
@@ -284,95 +284,6 @@
 }
 
 // TODO(debargha): Consolidate the functions below into a separate file.
-static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
-                               int input_shift) {
-#if CONFIG_ZERO_OFFSET_BITUPSHIFT
-  const int offset = 0;
-#else
-  // Note the offset is 1 less than half.
-  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
-#endif  // CONFIG_ZERO_OFFSET_BITUPSHIFT
-  int plane;
-  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt ||
-      input_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case AOM_IMG_FMT_I42016:
-    case AOM_IMG_FMT_I42216:
-    case AOM_IMG_FMT_I44416: break;
-    default: fatal("Unsupported image conversion"); break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
-      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      const uint16_t *p_src =
-          (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
-      uint16_t *p_dst =
-          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
-      for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset;
-    }
-  }
-}
-
-static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
-                              int input_shift) {
-#if CONFIG_ZERO_OFFSET_BITUPSHIFT
-  const int offset = 0;
-#else
-  // Note the offset is 1 less than half.
-  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
-#endif  // CONFIG_ZERO_OFFSET_BITUPSHIFT
-  int plane;
-  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
-      dst->x_chroma_shift != src->x_chroma_shift ||
-      dst->y_chroma_shift != src->y_chroma_shift ||
-      dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) {
-    fatal("Unsupported image conversion");
-  }
-  switch (src->fmt) {
-    case AOM_IMG_FMT_YV12:
-    case AOM_IMG_FMT_I420:
-    case AOM_IMG_FMT_I422:
-    case AOM_IMG_FMT_I444: break;
-    default: fatal("Unsupported image conversion"); break;
-  }
-  for (plane = 0; plane < 3; plane++) {
-    int w = src->d_w;
-    int h = src->d_h;
-    int x, y;
-    if (plane) {
-      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
-      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
-    }
-    for (y = 0; y < h; y++) {
-      const uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
-      uint16_t *p_dst =
-          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
-      for (x = 0; x < w; x++) {
-        *p_dst++ = (*p_src++ << input_shift) + offset;
-      }
-    }
-  }
-}
-
-void aom_img_upshift(aom_image_t *dst, const aom_image_t *src,
-                     int input_shift) {
-  if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
-    highbd_img_upshift(dst, src, input_shift);
-  } else {
-    lowbd_img_upshift(dst, src, input_shift);
-  }
-}
-
 void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) {
   int plane;
   if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w ||
diff --git a/common/tools_common.h b/common/tools_common.h
index a6ad507..10460ca 100644
--- a/common/tools_common.h
+++ b/common/tools_common.h
@@ -17,7 +17,7 @@
 #include "config/aom_config.h"
 
 #include "aom/aom_codec.h"
-#include "aom/aom_image.h"
+#include "aom/internal/aom_image_internal.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/msvc.h"
@@ -168,7 +168,6 @@
 int aom_img_read(aom_image_t *img, FILE *file);
 
 double sse_to_psnr(double samples, double peak, double mse);
-void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift);
 void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
                        int down_shift);
 void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
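
With the declaration now provided by aom/internal/aom_image_internal.h, a tool or library user that feeds 8-bit I420 frames through a 16-bit buffer could call aom_img_upshift roughly as sketched below. The allocation details, the 2-bit shift and the error handling are illustrative assumptions, not part of this change.

    /* Illustrative only: upshift an 8-bit I420 frame into 16-bit storage. */
    #include "aom/aom_image.h"
    #include "aom/internal/aom_image_internal.h"  /* aom_img_upshift */

    static int upshift_to_16bit(const aom_image_t *src8) {
      aom_image_t dst16;
      /* Same dimensions, high-bitdepth storage variant of the source format. */
      if (!aom_img_alloc(&dst16, AOM_IMG_FMT_I42016, src8->d_w, src8->d_h, 32))
        return -1;
      /* input_shift is the number of extra bits, e.g. 2 for 8-bit -> 10-bit. */
      aom_img_upshift(&dst16, src8, 2);
      /* ... hand dst16 to the encoder, then ... */
      aom_img_free(&dst16);
      return 0;
    }
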
diff --git a/doc/dev_guide/av1_encoder.dox b/doc/dev_guide/av1_encoder.dox
index c81c4b3..39bab11 100644
--- a/doc/dev_guide/av1_encoder.dox
+++ b/doc/dev_guide/av1_encoder.dox
@@ -462,12 +462,12 @@
   filtering algorithm. It breaks each frame into "MxM" blocks. For each
   block a motion search \ref tf_motion_search() is applied to find
   the motion vector from one neighboring frame. tf_build_predictor() is then
-  called to build the matching patch and \ref av1_apply_temporal_filter_c() (see
-  also optimised SIMD versions) to apply temporal filtering. The weighted
+  called to build the matching patch and \ref av1_highbd_apply_temporal_filter_c()
+  (see also optimised SIMD versions) to apply temporal filtering. The weighted
   average over each pixel is accumulated and finally normalized in
   \ref tf_normalize_filtered_frame() to generate the final filtered frame.
 
-- \ref av1_apply_temporal_filter_c(): the core function of our temporal
+- \ref av1_highbd_apply_temporal_filter_c(): the core function of our temporal
   filtering algorithm (see also optimised SIMD versions).
 
 \subsection architecture_enc_frame_proc_film Film Grain Modelling
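
The weighted averaging referred to in this documentation change boils down to the per-pixel arithmetic sketched below. It is distilled from the filter kernels removed earlier in this patch; the helper name is illustrative, and the TF_* constants and AOMMIN are assumed to remain available from the encoder headers.

    /* Scalar sketch of the per-pixel weighting inside the temporal filter.
     * Mirrors the arithmetic of the removed SIMD kernels; not the exact
     * signature of any library function. */
    #include <math.h>
    #include <stdint.h>
    #include "aom_dsp/aom_dsp_common.h"       /* AOMMIN */
    #include "av1/encoder/temporal_filter.h"  /* TF_* constants */

    static void accumulate_one_pixel(double window_error, double block_error,
                                     double d_factor, double n_decay,
                                     double q_decay, double s_decay,
                                     int pixel_value, uint32_t *accumulator,
                                     uint16_t *count) {
      const double combined_error =
          (TF_WINDOW_BLOCK_BALANCE_WEIGHT * window_error + block_error) /
          (TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) / TF_SEARCH_ERROR_NORM_WEIGHT;
      const double scaled_error =
          AOMMIN(combined_error * d_factor / n_decay / q_decay / s_decay, 7);
      const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
      *count += (uint16_t)weight;
      *accumulator += (uint32_t)(weight * pixel_value);
      /* tf_normalize_filtered_frame() later divides accumulator by count. */
    }
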
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index d7263d0..fb09bc8 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -283,8 +283,6 @@
   cfg.g_lag_in_frames = 3;
   cfg.g_bit_depth = AOM_BITS_8;
 
-  flags |= AOM_CODEC_USE_HIGHBITDEPTH;
-
   writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer) die("Failed to open %s for writing.", outfile_arg);
 
diff --git a/examples/av1_dec_fuzzer.cc b/examples/av1_dec_fuzzer.cc
index 85434be..74caf38 100644
--- a/examples/av1_dec_fuzzer.cc
+++ b/examples/av1_dec_fuzzer.cc
@@ -39,7 +39,7 @@
   aom_codec_ctx_t codec;
   // Set thread count in the range [1, 64].
   const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
-  aom_codec_dec_cfg_t cfg = { threads, 0, 0, 0 };
+  aom_codec_dec_cfg_t cfg = { threads, 0, 0 };
   if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) {
     return 0;
   }
diff --git a/examples/lightfield_encoder.c b/examples/lightfield_encoder.c
index 52f45f4..58c656f 100644
--- a/examples/lightfield_encoder.c
+++ b/examples/lightfield_encoder.c
@@ -369,7 +369,6 @@
   cfg.kf_mode = AOM_KF_DISABLED;
   cfg.large_scale_tile = 0;  // Only set it to 1 for camera frame encoding.
   cfg.g_bit_depth = AOM_BITS_8;
-  flags |= AOM_CODEC_USE_HIGHBITDEPTH;
 
   if (!(infile = fopen(infile_arg, "rb")))
     die("Failed to open %s for reading", infile_arg);
diff --git a/examples/noise_model.c b/examples/noise_model.c
index 0d1ad01..7583bc4 100644
--- a/examples/noise_model.c
+++ b/examples/noise_model.c
@@ -321,11 +321,9 @@
   }
   fprintf(stderr, "Bit depth: %d  stride:%d\n", args.bit_depth, raw.stride[0]);
 
-  const int high_bd = args.bit_depth > 8;
   const int block_size = args.block_size;
   aom_flat_block_finder_t block_finder;
-  aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth,
-                             high_bd);
+  aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth);
 
   const int num_blocks_w = (info.frame_width + block_size - 1) / block_size;
   const int num_blocks_h = (info.frame_height + block_size - 1) / block_size;
@@ -333,8 +331,8 @@
   // Sets the random seed on the first entry in the output table
   int16_t random_seed = 7391;
   aom_noise_model_t noise_model;
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth,
-                                      high_bd };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                      args.bit_depth };
   aom_noise_model_init(&noise_model, params);
 
   FILE *denoised_file = 0;
@@ -374,8 +372,8 @@
                                    raw.planes[2] };
       uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1],
                                       denoised.planes[2] };
-      int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd,
-                         raw.stride[2] >> high_bd };
+      int strides[3] = { raw.stride[0] >> 1, raw.stride[1] >> 1,
+                         raw.stride[2] >> 1 };
       int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 };
 
       fprintf(stdout, "Updating noise model...\n");
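The stride change in this hunk follows from frames now always being read into 16-bit buffers: aom_image_t strides count bytes, while the denoiser walks arrays of samples, so the former ">> high_bd" becomes a fixed ">> 1". A small illustration (variable names are made up):

    /* raw.stride[] counts bytes; with uint16_t samples the per-row sample
     * count is stride / sizeof(uint16_t), i.e. stride >> 1. */
    const uint16_t *luma = (const uint16_t *)raw.planes[0];
    const int luma_stride = raw.stride[0] >> 1; /* samples per row */
    /* pixel (r, c) is then luma[r * luma_stride + c] */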
diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc
index 8a1c5d4..317fdf6 100644
--- a/test/arf_freq_test.cc
+++ b/test/arf_freq_test.cc
@@ -175,7 +175,6 @@
   cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
   cfg_.g_bit_depth = test_video_param_.bit_depth;
   init_flags_ = AOM_CODEC_USE_PSNR;
-  if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
 
   std::unique_ptr<libaom_test::VideoSource> video;
   if (is_extension_y4m(test_video_param_.filename)) {
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index 5565ebb..5d31dc5 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -382,58 +382,6 @@
 
 typedef tuple<int, int> BlockDimension;
 
-typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  const InterpFilterParams *filter_params_y,
-                                  const int subpel_x_qn, const int x_step_qn,
-                                  const int subpel_y_qn, const int y_step_qn,
-                                  ConvolveParams *conv_params);
-
-// Test parameter list:
-//  <tst_fun, dims, ntaps_x, ntaps_y, avg>
-typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool>
-    LowBDParams;
-
-class LowBDConvolveScaleTest
-    : public ConvolveScaleTestBase<uint8_t>,
-      public ::testing::WithParamInterface<LowBDParams> {
- public:
-  virtual ~LowBDConvolveScaleTest() {}
-
-  void SetUp() {
-    tst_fun_ = GET_PARAM(0);
-
-    const BlockDimension &block = GET_PARAM(1);
-    const NTaps ntaps_x = GET_PARAM(2);
-    const NTaps ntaps_y = GET_PARAM(3);
-    const int bd = 8;
-    const bool avg = GET_PARAM(4);
-
-    SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
-  }
-
-  void RunOne(bool ref) {
-    const uint8_t *src = image_->GetSrcData(ref, false);
-    uint8_t *dst = image_->GetDstData(ref, false);
-    convolve_params_.dst = image_->GetDst16Data(ref, false);
-    const int src_stride = image_->src_stride();
-    const int dst_stride = image_->dst_stride();
-    if (ref) {
-      av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_,
-                              &filter_x_.params_, &filter_y_.params_, subpel_x_,
-                              kXStepQn, subpel_y_, kYStepQn, &convolve_params_);
-    } else {
-      tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
-               &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
-               subpel_y_, kYStepQn, &convolve_params_);
-    }
-  }
-
- private:
-  LowbdConvolveFunc tst_fun_;
-};
-
 const BlockDimension kBlockDim[] = {
   make_tuple(2, 2),    make_tuple(2, 4),    make_tuple(4, 4),
   make_tuple(4, 8),    make_tuple(8, 4),    make_tuple(8, 8),
@@ -445,16 +393,6 @@
 
 const NTaps kNTaps[] = { EIGHT_TAP };
 
-TEST_P(LowBDConvolveScaleTest, Check) { Run(); }
-TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, LowBDConvolveScaleTest,
-    ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1),
-                       ::testing::ValuesIn(kBlockDim),
-                       ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
-                       ::testing::Bool()));
-
 typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 37c96b1..4f49982 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -118,33 +118,9 @@
   return result;
 }
 
-template <typename T>
-std::vector<TestParam<T>> GetLowbdTestParams(T test_func) {
-  return GetTestParams({ 8 }, test_func);
-}
-
-template <typename T>
-::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdParams(
-    T test_func) {
-  return ::testing::ValuesIn(GetLowbdTestParams(test_func));
-}
-
 // Test the test-parameters generators work as expected.
 class AV1ConvolveParametersTest : public ::testing::Test {};
 
-TEST_F(AV1ConvolveParametersTest, GetLowbdTestParams) {
-  auto v = GetLowbdTestParams(av1_convolve_x_sr_c);
-  ASSERT_EQ(27U, v.size());
-  for (const auto &p : v) {
-    ASSERT_EQ(8, p.BitDepth());
-    // Needed (instead of ASSERT_EQ(...) since gtest does not
-    // have built in printing for arbitrary functions, which
-    // causes a compilation error.
-    bool same_fn = av1_convolve_x_sr_c == p.TestFunction();
-    ASSERT_TRUE(same_fn);
-  }
-}
-
 template <typename T>
 std::vector<TestParam<T>> GetHighbdTestParams(T test_func) {
   return GetTestParams({ 10, 12 }, test_func);
@@ -308,67 +284,6 @@
   uint16_t input16_2_[kInputStride * kInputStride];
 };
 
-////////////////////////////////////////////////////////
-// Single reference convolve-x functions (low bit-depth)
-////////////////////////////////////////////////////////
-typedef void (*convolve_x_func)(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams *filter_params_x,
-                                const int subpel_x_qn,
-                                ConvolveParams *conv_params);
-
-class AV1ConvolveXTest : public AV1ConvolveTest<convolve_x_func> {
- public:
-  void RunTest() {
-    for (int sub_x = 0; sub_x < 16; ++sub_x) {
-      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
-           ++filter) {
-        InterpFilter f = static_cast<InterpFilter>(filter);
-        TestConvolve(sub_x, f);
-      }
-    }
-  }
-
- private:
-  void TestConvolve(const int sub_x, const InterpFilter filter) {
-    const int width = GetParam().Block().Width();
-    const int height = GetParam().Block().Height();
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size(filter, width);
-    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
-    const uint8_t *input = FirstRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    av1_convolve_x_sr(input, width, reference, kOutputStride, width, height,
-                      filter_params_x, sub_x, &conv_params1);
-
-    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
-    convolve_x_func test_func = GetParam().TestFunction();
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    test_func(input, width, test, kOutputStride, width, height, filter_params_x,
-              sub_x, &conv_params2);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-};
-
-TEST_P(AV1ConvolveXTest, RunTest) { RunTest(); }
-INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXTest,
-                         BuildLowbdParams(av1_convolve_x_sr_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXTest,
-                         BuildLowbdParams(av1_convolve_x_sr_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXTest,
-                         BuildLowbdParams(av1_convolve_x_sr_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXTest,
-                         BuildLowbdParams(av1_convolve_x_sr_neon));
-#endif
-
 /////////////////////////////////////////////////////////
 // Single reference convolve-x functions (high bit-depth)
 /////////////////////////////////////////////////////////
@@ -428,64 +343,6 @@
                          BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
 #endif
 
-////////////////////////////////////////////////////////
-// Single reference convolve-y functions (low bit-depth)
-////////////////////////////////////////////////////////
-typedef void (*convolve_y_func)(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride, int w, int h,
-                                const InterpFilterParams *filter_params_y,
-                                const int subpel_y_qn);
-
-class AV1ConvolveYTest : public AV1ConvolveTest<convolve_y_func> {
- public:
-  void RunTest() {
-    for (int sub_y = 0; sub_y < 16; ++sub_y) {
-      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
-           ++filter) {
-        InterpFilter f = static_cast<InterpFilter>(filter);
-        TestConvolve(sub_y, f);
-      }
-    }
-  }
-
- private:
-  void TestConvolve(const int sub_y, const InterpFilter filter) {
-    const int width = GetParam().Block().Width();
-    const int height = GetParam().Block().Height();
-
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size(filter, height);
-    const uint8_t *input = FirstRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    av1_convolve_y_sr(input, width, reference, kOutputStride, width, height,
-                      filter_params_y, sub_y);
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
-                              filter_params_y, sub_y);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-};
-
-TEST_P(AV1ConvolveYTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYTest,
-                         BuildLowbdParams(av1_convolve_y_sr_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYTest,
-                         BuildLowbdParams(av1_convolve_y_sr_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYTest,
-                         BuildLowbdParams(av1_convolve_y_sr_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYTest,
-                         BuildLowbdParams(av1_convolve_y_sr_neon));
-#endif
-
 /////////////////////////////////////////////////////////
 // Single reference convolve-y functions (high bit-depth)
 /////////////////////////////////////////////////////////
@@ -539,59 +396,6 @@
                          BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
 #endif
 
-//////////////////////////////////////////////////////////////
-// Single reference convolve-copy functions (low bit-depth)
-//////////////////////////////////////////////////////////////
-typedef void (*convolve_copy_func)(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride, int w,
-                                   int h);
-
-class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> {
- public:
-  void RunTest() {
-    const int width = GetParam().Block().Width();
-    const int height = GetParam().Block().Height();
-    const uint8_t *input = FirstRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    aom_convolve_copy(input, width, reference, kOutputStride, width, height);
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    GetParam().TestFunction()(input, width, test, kOutputStride, width, height);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-};
-
-// Note that even though these are AOM convolve functions, we are using the
-// newer AV1 test framework.
-TEST_P(AV1ConvolveCopyTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyTest,
-                         BuildLowbdParams(aom_convolve_copy_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyTest,
-                         BuildLowbdParams(aom_convolve_copy_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyTest,
-                         BuildLowbdParams(aom_convolve_copy_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyTest,
-                         BuildLowbdParams(aom_convolve_copy_neon));
-#endif
-
-#if HAVE_MSA
-INSTANTIATE_TEST_SUITE_P(MSA, AV1ConvolveCopyTest,
-                         BuildLowbdParams(aom_convolve_copy_msa));
-#endif
-
-#if HAVE_DSPR2
-INSTANTIATE_TEST_SUITE_P(DSPR2, AV1ConvolveCopyTest,
-                         BuildLowbdParams(aom_convolve_copy_dspr2));
-#endif
-
 ///////////////////////////////////////////////////////////////
 // Single reference convolve-copy functions (high bit-depth)
 ///////////////////////////////////////////////////////////////
@@ -631,75 +435,6 @@
                          BuildHighbdParams(aom_highbd_convolve_copy_avx2));
 #endif
 
-/////////////////////////////////////////////////////////
-// Single reference convolve-2D functions (low bit-depth)
-/////////////////////////////////////////////////////////
-typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride, int w, int h,
-                                 const InterpFilterParams *filter_params_x,
-                                 const InterpFilterParams *filter_params_y,
-                                 const int subpel_x_qn, const int subpel_y_qn,
-                                 ConvolveParams *conv_params);
-
-class AV1Convolve2DTest : public AV1ConvolveTest<convolve_2d_func> {
- public:
-  void RunTest() {
-    for (int sub_x = 0; sub_x < 16; ++sub_x) {
-      for (int sub_y = 0; sub_y < 16; ++sub_y) {
-        for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
-          for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
-            TestConvolve(static_cast<InterpFilter>(h_f),
-                         static_cast<InterpFilter>(v_f), sub_x, sub_y);
-          }
-        }
-      }
-    }
-  }
-
- private:
-  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
-                    const int sub_x, const int sub_y) {
-    const int width = GetParam().Block().Width();
-    const int height = GetParam().Block().Height();
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size(h_f, width);
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size(v_f, height);
-    const uint8_t *input = FirstRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
-    av1_convolve_2d_sr(input, width, reference, kOutputStride, width, height,
-                       filter_params_x, filter_params_y, sub_x, sub_y,
-                       &conv_params1);
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
-    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
-                              filter_params_x, filter_params_y, sub_x, sub_y,
-                              &conv_params2);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-};
-
-TEST_P(AV1Convolve2DTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DTest,
-                         BuildLowbdParams(av1_convolve_2d_sr_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DTest,
-                         BuildLowbdParams(av1_convolve_2d_sr_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DTest,
-                         BuildLowbdParams(av1_convolve_2d_sr_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DTest,
-                         BuildLowbdParams(av1_convolve_2d_sr_neon));
-#endif
-
 //////////////////////////////////////////////////////////
 // Single reference convolve-2d functions (high bit-depth)
 //////////////////////////////////////////////////////////
@@ -793,27 +528,6 @@
 }
 
 template <typename T>
-std::vector<TestParam<T>> GetLowbdLumaTestParams(T test_func) {
-  return GetLumaTestParams({ 8 }, test_func);
-}
-
-template <typename T>
-::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdLumaParams(
-    T test_func) {
-  return ::testing::ValuesIn(GetLowbdLumaTestParams(test_func));
-}
-
-TEST_F(AV1ConvolveParametersTest, GetLowbdLumaTestParams) {
-  auto v = GetLowbdLumaTestParams(av1_dist_wtd_convolve_x_c);
-  ASSERT_EQ(22U, v.size());
-  for (const auto &e : v) {
-    ASSERT_EQ(8, e.BitDepth());
-    bool same_fn = av1_dist_wtd_convolve_x_c == e.TestFunction();
-    ASSERT_TRUE(same_fn);
-  }
-}
-
-template <typename T>
 std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) {
   return GetLumaTestParams({ 10, 12 }, test_func);
 }
@@ -882,10 +596,9 @@
   }
 }
 
-////////////////////////////////////////////////
-// Compound convolve-x functions (low bit-depth)
-////////////////////////////////////////////////
-
+/////////////////////////////////////////////////
+// Compound convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////
 ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf,
                                  int width, int bit_depth,
                                  const CompoundParam &compound) {
@@ -897,94 +610,6 @@
   return conv_params;
 }
 
-class AV1ConvolveXCompoundTest : public AV1ConvolveTest<convolve_x_func> {
- public:
-  void RunTest() {
-    auto compound_params = GetCompoundParams();
-    for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
-      for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
-        for (const auto &c : compound_params) {
-          TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
-        }
-      }
-    }
-  }
-
- protected:
-  virtual const InterpFilterParams *FilterParams(InterpFilter f,
-                                                 const BlockSize &block) const {
-    return av1_get_interp_filter_params_with_block_size(f, block.Width());
-  }
-
-  virtual convolve_x_func ReferenceFunc() const {
-    return av1_dist_wtd_convolve_x;
-  }
-
- private:
-  void TestConvolve(const int sub_pix, const InterpFilter filter,
-                    const CompoundParam &compound) {
-    const int width = GetParam().Block().Width();
-    const int height = GetParam().Block().Height();
-    const uint8_t *input1 = FirstRandomInput8(GetParam());
-    const uint8_t *input2 = SecondRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
-    Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
-             compound, sub_pix, filter);
-
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
-    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
-             compound, sub_pix, filter);
-
-    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-
- private:
-  void Convolve(convolve_x_func test_func, const uint8_t *src1,
-                const uint8_t *src2, uint8_t *dst, CONV_BUF_TYPE *conv_buf,
-                const CompoundParam &compound, const int sub_pix,
-                const InterpFilter filter) {
-    const int width = GetParam().Block().Width();
-    const int height = GetParam().Block().Height();
-    const InterpFilterParams *filter_params =
-        FilterParams(filter, GetParam().Block());
-
-    ConvolveParams conv_params =
-        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
-    test_func(src1, width, dst, kOutputStride, width, height, filter_params,
-              sub_pix, &conv_params);
-
-    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
-    test_func(src2, width, dst, kOutputStride, width, height, filter_params,
-              sub_pix, &conv_params);
-  }
-};
-
-TEST_P(AV1ConvolveXCompoundTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon));
-#endif
-
-/////////////////////////////////////////////////
-// Compound convolve-x functions (high bit-depth)
-/////////////////////////////////////////////////
 class AV1ConvolveXHighbdCompoundTest
     : public AV1ConvolveTest<highbd_convolve_x_func> {
  public:
@@ -1069,44 +694,6 @@
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2));
 #endif
 
-////////////////////////////////////////////////
-// Compound convolve-y functions (low bit-depth)
-////////////////////////////////////////////////
-
-// Note that the X and Y convolve functions have the same type signature and
-// logic; they only differentiate the filter parameters and reference function.
-class AV1ConvolveYCompoundTest : public AV1ConvolveXCompoundTest {
- protected:
-  virtual const InterpFilterParams *FilterParams(
-      InterpFilter f, const BlockSize &block) const override {
-    return av1_get_interp_filter_params_with_block_size(f, block.Height());
-  }
-
-  virtual convolve_x_func ReferenceFunc() const override {
-    return av1_dist_wtd_convolve_y;
-  }
-};
-
-TEST_P(AV1ConvolveYCompoundTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_y_neon));
-#endif
-
 /////////////////////////////////////////////////
 // Compound convolve-y functions (high bit-depth)
 /////////////////////////////////////////////////
@@ -1140,84 +727,6 @@
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2));
 #endif
 
-//////////////////////////////////////////////////////
-// Compound convolve-2d-copy functions (low bit-depth)
-//////////////////////////////////////////////////////
-typedef void (*compound_conv_2d_copy_func)(const uint8_t *src, int src_stride,
-                                           uint8_t *dst, int dst_stride, int w,
-                                           int h, ConvolveParams *conv_params);
-
-class AV1Convolve2DCopyCompoundTest
-    : public AV1ConvolveTest<compound_conv_2d_copy_func> {
- public:
-  void RunTest() {
-    auto compound_params = GetCompoundParams();
-    for (const auto &compound : compound_params) {
-      TestConvolve(compound);
-    }
-  }
-
- private:
-  void TestConvolve(const CompoundParam &compound) {
-    const BlockSize &block = GetParam().Block();
-    const int width = block.Width();
-    const int height = block.Height();
-
-    const uint8_t *input1 = FirstRandomInput8(GetParam());
-    const uint8_t *input2 = SecondRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
-    Convolve(av1_dist_wtd_convolve_2d_copy, input1, input2, reference,
-             reference_conv_buf, compound);
-
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
-    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
-             compound);
-
-    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-
- private:
-  void Convolve(compound_conv_2d_copy_func test_func, const uint8_t *src1,
-                const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
-                const CompoundParam &compound) {
-    const BlockSize &block = GetParam().Block();
-    const int width = block.Width();
-    const int height = block.Height();
-    ConvolveParams conv_params =
-        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
-    test_func(src1, width, dst, kOutputStride, width, height, &conv_params);
-
-    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
-    test_func(src2, width, dst, kOutputStride, width, height, &conv_params);
-  }
-};
-
-TEST_P(AV1Convolve2DCopyCompoundTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCopyCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1Convolve2DCopyCompoundTest,
-    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_sse2));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1Convolve2DCopyCompoundTest,
-    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1Convolve2DCopyCompoundTest,
-    BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_neon));
-#endif
-
 ///////////////////////////////////////////////////////
 // Compound convolve-2d-copy functions (high bit-depth)
 ///////////////////////////////////////////////////////
@@ -1297,103 +806,6 @@
     BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2));
 #endif
 
-/////////////////////////////////////////////////
-// Compound convolve-2d functions (low bit-depth)
-/////////////////////////////////////////////////
-
-class AV1Convolve2DCompoundTest : public AV1ConvolveTest<convolve_2d_func> {
- public:
-  void RunTest() {
-    auto compound_params = GetCompoundParams();
-    for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
-      for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
-        for (int sub_x = 0; sub_x < 16; ++sub_x) {
-          for (int sub_y = 0; sub_y < 16; ++sub_y) {
-            for (const auto &compound : compound_params) {
-              TestConvolve(static_cast<InterpFilter>(h_f),
-                           static_cast<InterpFilter>(v_f), sub_x, sub_y,
-                           compound);
-            }
-          }
-        }
-      }
-    }
-  }
-
- private:
-  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
-                    const int sub_x, const int sub_y,
-                    const CompoundParam &compound) {
-    const BlockSize &block = GetParam().Block();
-    const int width = block.Width();
-    const int height = block.Height();
-
-    const uint8_t *input1 = FirstRandomInput8(GetParam());
-    const uint8_t *input2 = SecondRandomInput8(GetParam());
-    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
-    Convolve(av1_dist_wtd_convolve_2d, input1, input2, reference,
-             reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
-
-    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
-    Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
-             compound, h_f, v_f, sub_x, sub_y);
-
-    AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);
-    AssertOutputBufferEq(reference, test, width, height);
-  }
-
- private:
-  void Convolve(convolve_2d_func test_func, const uint8_t *src1,
-                const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
-                const CompoundParam &compound, const InterpFilter h_f,
-                const InterpFilter v_f, const int sub_x, const int sub_y) {
-    const BlockSize &block = GetParam().Block();
-    const int width = block.Width();
-    const int height = block.Height();
-
-    const InterpFilterParams *filter_params_x =
-        av1_get_interp_filter_params_with_block_size(h_f, width);
-    const InterpFilterParams *filter_params_y =
-        av1_get_interp_filter_params_with_block_size(v_f, height);
-    ConvolveParams conv_params =
-        GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
-
-    test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
-              filter_params_y, sub_x, sub_y, &conv_params);
-
-    conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
-    test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
-              filter_params_y, sub_x, sub_y, &conv_params);
-  }
-};
-
-TEST_P(AV1Convolve2DCompoundTest, RunTest) { RunTest(); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_sse2));
-#endif
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_ssse3));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DCompoundTest,
-                         BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon));
-#endif
-
 //////////////////////////////////////////////////
 // Compound convolve-2d functions (high bit-depth)
 //////////////////////////////////////////////////
diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc
index 7b76eab..113a50f 100644
--- a/test/av1_ext_tile_test.cc
+++ b/test/av1_ext_tile_test.cc
@@ -44,7 +44,6 @@
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.w = kImgWidth;
     cfg.h = kImgHeight;
-    cfg.allow_lowbitdepth = 1;
 
     decoder_ = codec_->CreateDecoder(cfg, 0);
     decoder_->Control(AV1_SET_TILE_MODE, 1);
@@ -53,7 +52,7 @@
     decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
 
     // Allocate buffer to store tile image.
-    aom_img_alloc(&tile_img_, AOM_IMG_FMT_I420, kImgWidth, kImgHeight, 32);
+    aom_img_alloc(&tile_img_, AOM_IMG_FMT_I42016, kImgWidth, kImgHeight, 32);
 
     md5_.clear();
     tile_md5_.clear();
@@ -152,6 +151,8 @@
           break;
         }
 
+        if (!img) continue;
+
         const int kMaxMBPlane = 3;
         for (int plane = 0; plane < kMaxMBPlane; ++plane) {
           const int shift = (plane == 0) ? 0 : 1;
@@ -159,10 +160,13 @@
           int tile_width = kTIleSizeInPixels >> shift;
 
           for (int tr = 0; tr < tile_height; ++tr) {
-            memcpy(tile_img_.planes[plane] +
-                       tile_img_.stride[plane] * (r * tile_height + tr) +
-                       c * tile_width,
-                   img->planes[plane] + img->stride[plane] * tr, tile_width);
+            const uint16_t *src = (const uint16_t *)(img->planes[plane] +
+                                                     img->stride[plane] * tr);
+            uint16_t *dst =
+                (uint16_t *)(tile_img_.planes[plane] +
+                             tile_img_.stride[plane] * (r * tile_height + tr) +
+                             c * tile_width * sizeof(src[0]));
+            memcpy(dst, src, tile_width * sizeof(src[0]));
           }
         }
       }
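In the tile-copy hunk above, rows are copied as uint16_t, so the column offset and the copy length are scaled by sizeof(src[0]) while the stride[] terms stay unscaled, since they are already byte offsets. The same pattern reduced to one call, with dst_img, src_img, row, col and width as placeholders:

    memcpy((uint16_t *)(dst_img.planes[0] + dst_img.stride[0] * row) + col,
           (const uint16_t *)(src_img->planes[0] + src_img->stride[0] * row),
           width * sizeof(uint16_t));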
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index 815d982..2630655 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -237,7 +237,6 @@
   txfm_param.tx_size = tx_size_;
   txfm_param.lossless = 0;
   txfm_param.bd = bit_depth_;
-  txfm_param.is_hbd = 1;
   txfm_param.tx_set_type = EXT_TX_SET_ALL16;
 
   for (int cnt = 0; cnt < randTimes; ++cnt) {
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
index 932e0e5..7b4982f 100644
--- a/test/av1_horz_only_frame_superres_test.cc
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -248,60 +248,6 @@
   TestImage<Pixel> *image_;
 };
 
-typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
-                                         uint8_t *dst, int dst_stride, int w,
-                                         int h, const int16_t *x_filters,
-                                         const int x0_qn, const int x_step_qn);
-
-// Test parameter list:
-//  <tst_fun_>
-typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
-
-class LowBDConvolveHorizRSTest
-    : public ConvolveHorizRSTestBase<uint8_t>,
-      public ::testing::WithParamInterface<LowBDParams> {
- public:
-  virtual ~LowBDConvolveHorizRSTest() {}
-
-  void SetUp() {
-    tst_fun_ = GET_PARAM(0);
-    const int bd = 8;
-    SetBitDepth(bd);
-  }
-
-  void RunOne(bool ref) {
-    const uint8_t *src = image_->GetSrcData(ref, false);
-    uint8_t *dst = image_->GetDstData(ref, false);
-    const int src_stride = image_->src_stride();
-    const int dst_stride = image_->dst_stride();
-    const int width_src = image_->src_width();
-    const int width_dst = image_->dst_width();
-    const int height = image_->height();
-    const int x0_qn = image_->x0();
-
-    const int32_t x_step_qn =
-        av1_get_upscale_convolve_step(width_src, width_dst);
-
-    if (ref) {
-      av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
-                              height, &av1_resize_filter_normative[0][0], x0_qn,
-                              x_step_qn);
-    } else {
-      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
-               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
-    }
-  }
-
- private:
-  LowBDConvolveHorizRsFunc tst_fun_;
-};
-
-TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
-TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
-
-INSTANTIATE_TEST_SUITE_P(SSE4_1, LowBDConvolveHorizRSTest,
-                         ::testing::Values(av1_convolve_horiz_rs_sse4_1));
-
 typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
                                           uint16_t *dst, int dst_stride, int w,
                                           int h, const int16_t *x_filters,
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
deleted file mode 100644
index 9652167..0000000
--- a/test/av1_inv_txfm2d_test.cc
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <tuple>
-#include <vector>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_ports/aom_timer.h"
-#include "av1/common/av1_inv_txfm1d_cfg.h"
-#include "av1/common/scan.h"
-#include "test/acm_random.h"
-#include "test/av1_txfm_test.h"
-#include "test/util.h"
-
-using libaom_test::ACMRandom;
-using libaom_test::bd;
-using libaom_test::compute_avg_abs_error;
-using libaom_test::input_base;
-using libaom_test::InvTxfm2dFunc;
-using libaom_test::LbdInvTxfm2dFunc;
-
-using ::testing::Combine;
-using ::testing::Range;
-using ::testing::Values;
-
-using std::vector;
-
-typedef TX_TYPE TxType;
-typedef TX_SIZE TxSize;
-
-namespace {
-
-static const char *tx_type_name[] = {
-  "DCT_DCT",
-  "ADST_DCT",
-  "DCT_ADST",
-  "ADST_ADST",
-  "FLIPADST_DCT",
-  "DCT_FLIPADST",
-  "FLIPADST_FLIPADST",
-  "ADST_FLIPADST",
-  "FLIPADST_ADST",
-  "IDTX",
-  "V_DCT",
-  "H_DCT",
-  "V_ADST",
-  "H_ADST",
-  "V_FLIPADST",
-  "H_FLIPADST",
-};
-
-// AV1InvTxfm2dParam argument list:
-// tx_type_, tx_size_, max_error_, max_avg_error_
-typedef std::tuple<TxType, TxSize, int, double> AV1InvTxfm2dParam;
-
-class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
- public:
-  virtual void SetUp() {
-    tx_type_ = GET_PARAM(0);
-    tx_size_ = GET_PARAM(1);
-    max_error_ = GET_PARAM(2);
-    max_avg_error_ = GET_PARAM(3);
-  }
-
-  void RunRoundtripCheck() {
-    int tx_w = tx_size_wide[tx_size_];
-    int tx_h = tx_size_high[tx_size_];
-    int txfm2d_size = tx_w * tx_h;
-    const FwdTxfm2dFunc fwd_txfm_func = libaom_test::fwd_txfm_func_ls[tx_size_];
-    const InvTxfm2dFunc inv_txfm_func = libaom_test::inv_txfm_func_ls[tx_size_];
-    double avg_abs_error = 0;
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-    const int count = 500;
-
-    for (int ci = 0; ci < count; ci++) {
-      DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
-      ASSERT_LE(txfm2d_size, NELEMENTS(input));
-
-      for (int ni = 0; ni < txfm2d_size; ++ni) {
-        if (ci == 0) {
-          int extreme_input = input_base - 1;
-          input[ni] = extreme_input;  // extreme case
-        } else {
-          input[ni] = rnd.Rand16() % input_base;
-        }
-      }
-
-      DECLARE_ALIGNED(16, uint16_t, expected[64 * 64]) = { 0 };
-      ASSERT_LE(txfm2d_size, NELEMENTS(expected));
-      if (TxfmUsesApproximation()) {
-        // Compare reference forward HT + inverse HT vs forward HT + inverse HT.
-        double ref_input[64 * 64];
-        ASSERT_LE(txfm2d_size, NELEMENTS(ref_input));
-        for (int ni = 0; ni < txfm2d_size; ++ni) {
-          ref_input[ni] = input[ni];
-        }
-        double ref_coeffs[64 * 64] = { 0 };
-        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
-        ASSERT_EQ(tx_type_, static_cast<TxType>(DCT_DCT));
-        libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
-                                         tx_size_);
-        DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
-        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs_int));
-        for (int ni = 0; ni < txfm2d_size; ++ni) {
-          ref_coeffs_int[ni] = (int32_t)round(ref_coeffs[ni]);
-        }
-        inv_txfm_func(ref_coeffs_int, expected, tx_w, tx_type_, bd);
-      } else {
-        // Compare original input vs forward HT + inverse HT.
-        for (int ni = 0; ni < txfm2d_size; ++ni) {
-          expected[ni] = input[ni];
-        }
-      }
-
-      DECLARE_ALIGNED(16, int32_t, coeffs[64 * 64]) = { 0 };
-      ASSERT_LE(txfm2d_size, NELEMENTS(coeffs));
-      fwd_txfm_func(input, coeffs, tx_w, tx_type_, bd);
-
-      DECLARE_ALIGNED(16, uint16_t, actual[64 * 64]) = { 0 };
-      ASSERT_LE(txfm2d_size, NELEMENTS(actual));
-      inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
-
-      double actual_max_error = 0;
-      for (int ni = 0; ni < txfm2d_size; ++ni) {
-        const double this_error = abs(expected[ni] - actual[ni]);
-        actual_max_error = AOMMAX(actual_max_error, this_error);
-      }
-      EXPECT_GE(max_error_, actual_max_error)
-          << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
-      if (actual_max_error > max_error_) {  // exit early.
-        break;
-      }
-      avg_abs_error += compute_avg_abs_error<uint16_t, uint16_t>(
-          expected, actual, txfm2d_size);
-    }
-
-    avg_abs_error /= count;
-    EXPECT_GE(max_avg_error_, avg_abs_error)
-        << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
-  }
-
- private:
-  bool TxfmUsesApproximation() {
-    if (tx_size_wide[tx_size_] == 64 || tx_size_high[tx_size_] == 64) {
-      return true;
-    }
-    return false;
-  }
-
-  int max_error_;
-  double max_avg_error_;
-  TxType tx_type_;
-  TxSize tx_size_;
-};
-
-static int max_error_ls[TX_SIZES_ALL] = {
-  2,  // 4x4 transform
-  2,  // 8x8 transform
-  2,  // 16x16 transform
-  4,  // 32x32 transform
-  3,  // 64x64 transform
-  2,  // 4x8 transform
-  2,  // 8x4 transform
-  2,  // 8x16 transform
-  2,  // 16x8 transform
-  3,  // 16x32 transform
-  3,  // 32x16 transform
-  5,  // 32x64 transform
-  5,  // 64x32 transform
-  2,  // 4x16 transform
-  2,  // 16x4 transform
-  2,  // 8x32 transform
-  2,  // 32x8 transform
-  3,  // 16x64 transform
-  3,  // 64x16 transform
-};
-
-static double avg_error_ls[TX_SIZES_ALL] = {
-  0.002,  // 4x4 transform
-  0.05,   // 8x8 transform
-  0.07,   // 16x16 transform
-  0.4,    // 32x32 transform
-  0.3,    // 64x64 transform
-  0.02,   // 4x8 transform
-  0.02,   // 8x4 transform
-  0.04,   // 8x16 transform
-  0.07,   // 16x8 transform
-  0.4,    // 16x32 transform
-  0.5,    // 32x16 transform
-  0.38,   // 32x64 transform
-  0.39,   // 64x32 transform
-  0.2,    // 4x16 transform
-  0.2,    // 16x4 transform
-  0.2,    // 8x32 transform
-  0.2,    // 32x8 transform
-  0.38,   // 16x64 transform
-  0.38,   // 64x16 transform
-};
-
-vector<AV1InvTxfm2dParam> GetInvTxfm2dParamList() {
-  vector<AV1InvTxfm2dParam> param_list;
-  for (int s = 0; s < TX_SIZES; ++s) {
-    const int max_error = max_error_ls[s];
-    const double avg_error = avg_error_ls[s];
-    for (int t = 0; t < TX_TYPES; ++t) {
-      const TxType tx_type = static_cast<TxType>(t);
-      const TxSize tx_size = static_cast<TxSize>(s);
-      if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
-        param_list.push_back(
-            AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
-      }
-    }
-  }
-  return param_list;
-}
-
-INSTANTIATE_TEST_SUITE_P(C, AV1InvTxfm2d,
-                         ::testing::ValuesIn(GetInvTxfm2dParamList()));
-
-TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
-
-TEST(AV1InvTxfm2d, CfgTest) {
-  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
-    int bd = libaom_test::bd_arr[bd_idx];
-    int8_t low_range = libaom_test::low_range_arr[bd_idx];
-    int8_t high_range = libaom_test::high_range_arr[bd_idx];
-    for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
-      for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-        if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(tx_size),
-                                           static_cast<TxType>(tx_type)) ==
-            false) {
-          continue;
-        }
-        TXFM_2D_FLIP_CFG cfg;
-        av1_get_inv_txfm_cfg(static_cast<TxType>(tx_type),
-                             static_cast<TxSize>(tx_size), &cfg);
-        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
-        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
-        av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
-                                static_cast<TxSize>(tx_size), bd);
-        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
-                                            cfg.cos_bit_col, low_range,
-                                            high_range);
-        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
-                                            cfg.cos_bit_row, low_range,
-                                            high_range);
-      }
-    }
-  }
-}
-
-typedef std::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
-class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
- public:
-  virtual void SetUp() { target_func_ = GET_PARAM(0); }
-  void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
-                           int gt_int16 = 0);
-
- private:
-  LbdInvTxfm2dFunc target_func_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1LbdInvTxfm2d);
-
-void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
-                                          int run_times, int gt_int16) {
-  FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
-  InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
-  if (fwd_func_ == NULL || ref_func_ == NULL || target_func_ == NULL) {
-    return;
-  }
-  const int bd = 8;
-  const int BLK_WIDTH = 64;
-  const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
-  DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
-  DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
-  DECLARE_ALIGNED(16, uint8_t, output[BLK_SIZE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, ref_output[BLK_SIZE]) = { 0 };
-  int stride = BLK_WIDTH;
-  int rows = tx_size_high[tx_size];
-  int cols = tx_size_wide[tx_size];
-  const int rows_nonezero = AOMMIN(32, rows);
-  const int cols_nonezero = AOMMIN(32, cols);
-  run_times /= (rows * cols);
-  run_times = AOMMAX(1, run_times);
-  const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
-  const int16_t *scan = scan_order->scan;
-  const int16_t eobmax = rows_nonezero * cols_nonezero;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int randTimes = run_times == 1 ? (eobmax + 500) : 1;
-
-  for (int cnt = 0; cnt < randTimes; ++cnt) {
-    const int16_t max_in = (1 << (bd)) - 1;
-    for (int r = 0; r < BLK_WIDTH; ++r) {
-      for (int c = 0; c < BLK_WIDTH; ++c) {
-        input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
-        output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
-        ref_output[r * stride + c] = output[r * stride + c];
-      }
-    }
-    fwd_func_(input, inv_input, stride, tx_type, bd);
-
-    // produce eob input by setting high freq coeffs to zero
-    const int eob = AOMMIN(cnt + 1, eobmax);
-    for (int i = eob; i < eobmax; i++) {
-      inv_input[scan[i]] = 0;
-    }
-    if (gt_int16) {
-      inv_input[scan[eob - 1]] = ((int32_t)INT16_MAX * 100 / 141);
-    }
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      ref_func_(inv_input, ref_output, stride, tx_type, bd);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      target_func_(inv_input, output, stride, tx_type, tx_size, eob);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 10) {
-      printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
-             time2);
-      printf("(%3.2f)\n", time1 / time2);
-    }
-    for (int r = 0; r < rows; ++r) {
-      for (int c = 0; c < cols; ++c) {
-        uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
-        if (ref_value != output[r * stride + c]) {
-          printf(" ");
-        }
-        ASSERT_EQ(ref_value, output[r * stride + c])
-            << "[" << r << "," << c << "] " << cnt
-            << " tx_size: " << static_cast<int>(tx_size)
-            << " tx_type: " << tx_type_name[tx_type] << " eob " << eob;
-      }
-    }
-  }
-}
-
-TEST_P(AV1LbdInvTxfm2d, match) {
-  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
-    for (int i = 0; i < (int)TX_TYPES; ++i) {
-      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
-                                         static_cast<TxType>(i))) {
-        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j), 1);
-      }
-    }
-  }
-}
-
-TEST_P(AV1LbdInvTxfm2d, gt_int16) {
-  static const TxType types[] = { DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX,
-                                  V_DCT,   H_DCT,    H_ADST,       H_FLIPADST };
-  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
-    const TxSize sz = static_cast<TxSize>(j);
-    for (uint8_t i = 0; i < sizeof(types) / sizeof(types[0]); ++i) {
-      const TxType tp = types[i];
-      if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
-        RunAV1InvTxfm2dTest(tp, sz, 1, 1);
-      }
-    }
-  }
-}
-
-TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
-  for (int j = 1; j < (int)(TX_SIZES_ALL); ++j) {
-    for (int i = 0; i < (int)TX_TYPES; ++i) {
-      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
-                                         static_cast<TxType>(i))) {
-        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j),
-                            10000000);
-      }
-    }
-  }
-}
-
-#if HAVE_SSSE3
-#if defined(_MSC_VER) || defined(__SSSE3__)
-#include "av1/common/x86/av1_inv_txfm_ssse3.h"
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1LbdInvTxfm2d,
-                         ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
-#endif  // _MSC_VER || __SSSE3__
-#endif  // HAVE_SSSE3
-
-#if HAVE_AVX2
-extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
-                                              uint8_t *output, int stride,
-                                              TxType tx_type, TxSize tx_size,
-                                              int eob);
-
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1LbdInvTxfm2d,
-                         ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
-#endif  // HAVE_AVX2
-
-// TODO(yunqing): Re-enable this unit test for NEON version after the functions
-// are fixed.
-#if HAVE_NEON
-extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
-                                              uint8_t *output, int stride,
-                                              TX_TYPE tx_type, TX_SIZE tx_size,
-                                              int eob);
-
-INSTANTIATE_TEST_SUITE_P(NEON, AV1LbdInvTxfm2d,
-                         ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
-#endif  // HAVE_NEON
-
-}  // namespace
diff --git a/test/av1_key_value_api_test.cc b/test/av1_key_value_api_test.cc
index 88ba6af..c55c8e3 100644
--- a/test/av1_key_value_api_test.cc
+++ b/test/av1_key_value_api_test.cc
@@ -37,7 +37,7 @@
 #endif
 #if CONFIG_AV1_DECODER
     aom_codec_iface_t *iface_dx = aom_codec_av1_dx();
-    aom_codec_dec_cfg_t dec_cfg = { 0, 0, 0, 0 };
+    aom_codec_dec_cfg_t dec_cfg = { 0, 0, 0 };
 
     EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec_, iface_dx, &dec_cfg, 0));
 #endif
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index d60c30e..6a8f3c2 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -47,76 +47,6 @@
   ACMRandom rng_;
 };
 
-static void equiv_blend_residuals(int16_t *r, const int16_t *r0,
-                                  const int16_t *r1, const uint8_t *m, int N) {
-  for (int i = 0; i < N; i++) {
-    const int32_t m0 = m[i];
-    const int32_t m1 = MAX_MASK_VALUE - m0;
-    const int16_t R = m0 * r0[i] + m1 * r1[i];
-    // Note that this rounding is designed to match the result
-    // you would get when actually blending the 2 predictors and computing
-    // the residuals.
-    r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
-  }
-}
-
-static uint64_t equiv_sse_from_residuals(const int16_t *r0, const int16_t *r1,
-                                         const uint8_t *m, int N) {
-  uint64_t acc = 0;
-  for (int i = 0; i < N; i++) {
-    const int32_t m0 = m[i];
-    const int32_t m1 = MAX_MASK_VALUE - m0;
-    const int16_t R = m0 * r0[i] + m1 * r1[i];
-    const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
-    acc += r * r;
-  }
-  return acc;
-}
-
-TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) {
-  DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]);
-
-  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
-
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-      s[i] = rng_.Rand8();
-      m[i] = rng_(MAX_MASK_VALUE + 1);
-    }
-
-    const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
-    const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
-    const int N = w * h;
-
-    for (int j = 0; j < N; j++) {
-      p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
-      p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
-    }
-
-    aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, w, h, 0, 0);
-
-    aom_subtract_block(h, w, r0, w, s, w, p0, w);
-    aom_subtract_block(h, w, r1, w, s, w, p1, w);
-
-    aom_subtract_block(h, w, r_ref, w, s, w, p, w);
-    equiv_blend_residuals(r_tst, r0, r1, m, N);
-
-    for (int i = 0; i < N; ++i) ASSERT_EQ(r_ref[i], r_tst[i]);
-
-    uint64_t ref_sse = aom_sum_squares_i16(r_ref, N);
-    uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N);
-
-    ASSERT_EQ(ref_sse, tst_sse);
-  }
-}
-
 static uint64_t sse_from_residuals(const int16_t *r0, const int16_t *r1,
                                    const uint8_t *m, int N) {
   uint64_t acc = 0;
diff --git a/test/avg_test.cc b/test/avg_test.cc
deleted file mode 100644
index f5c9212..0000000
--- a/test/avg_test.cc
+++ /dev/null
@@ -1,663 +0,0 @@
-/*
- *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-
-namespace {
-
-using libaom_test::ACMRandom;
-
-template <typename Pixel>
-class AverageTestBase : public ::testing::Test {
- public:
-  AverageTestBase(int width, int height)
-      : width_(width), height_(height), source_data_(NULL), source_stride_(0),
-        bit_depth_(8) {}
-
-  virtual void TearDown() {
-    aom_free(source_data_);
-    source_data_ = NULL;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  // Handle blocks up to 4 blocks 64x64 with stride up to 128
-  static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
-
-  virtual void SetUp() {
-    source_data_ = static_cast<Pixel *>(
-        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
-    ASSERT_TRUE(source_data_ != NULL);
-    source_stride_ = (width_ + 31) & ~31;
-    bit_depth_ = 8;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
-  // Sum Pixels
-  static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) {
-    unsigned int average = 0;
-    for (int h = 0; h < 8; ++h) {
-      for (int w = 0; w < 8; ++w) average += source[h * pitch + w];
-    }
-    return (average + 32) >> 6;
-  }
-
-  static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) {
-    unsigned int average = 0;
-    for (int h = 0; h < 4; ++h) {
-      for (int w = 0; w < 4; ++w) average += source[h * pitch + w];
-    }
-    return (average + 8) >> 4;
-  }
-
-  void FillConstant(Pixel fill_constant) {
-    for (int i = 0; i < width_ * height_; ++i) {
-      source_data_[i] = fill_constant;
-    }
-  }
-
-  void FillRandom() {
-    for (int i = 0; i < width_ * height_; ++i) {
-      source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1);
-    }
-  }
-
-  int width_, height_;
-  Pixel *source_data_;
-  int source_stride_;
-  int bit_depth_;
-
-  ACMRandom rnd_;
-};
-typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch);
-
-// Arguments: width, height, pitch, block size, avg function.
-typedef std::tuple<int, int, int, int, AverageFunction> AvgFunc;
-
-class AverageTest : public AverageTestBase<uint8_t>,
-                    public ::testing::WithParamInterface<AvgFunc> {
- public:
-  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
-
- protected:
-  void CheckAverages() {
-    const int block_size = GET_PARAM(3);
-    unsigned int expected = 0;
-    if (block_size == 8) {
-      expected =
-          ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_);
-    } else if (block_size == 4) {
-      expected =
-          ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_);
-    }
-
-    unsigned int actual;
-    ASM_REGISTER_STATE_CHECK(
-        actual = GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_));
-
-    EXPECT_EQ(expected, actual);
-  }
-};
-
-TEST_P(AverageTest, MinValue) {
-  FillConstant(0);
-  CheckAverages();
-}
-
-TEST_P(AverageTest, MaxValue) {
-  FillConstant(255);
-  CheckAverages();
-}
-
-TEST_P(AverageTest, Random) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  for (int i = 0; i < 1000; i++) {
-    FillRandom();
-    CheckAverages();
-  }
-}
-
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
-                              const int ref_stride, const int height);
-
-// Params: height, asm function, c function.
-typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
-
-class IntProRowTest : public AverageTestBase<uint8_t>,
-                      public ::testing::WithParamInterface<IntProRowParam> {
- public:
-  IntProRowTest()
-      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
-  }
-
- protected:
-  virtual void SetUp() {
-    source_data_ = static_cast<uint8_t *>(
-        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
-    ASSERT_TRUE(source_data_ != NULL);
-
-    hbuf_asm_ = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
-    hbuf_c_ = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
-  }
-
-  virtual void TearDown() {
-    aom_free(source_data_);
-    source_data_ = NULL;
-    aom_free(hbuf_c_);
-    hbuf_c_ = NULL;
-    aom_free(hbuf_asm_);
-    hbuf_asm_ = NULL;
-  }
-
-  void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch\n";
-  }
-
-  void RunSpeedTest() {
-    const int numIter = 5000000;
-    printf("Height = %d number of iteration is %d \n", height_, numIter);
-    aom_usec_timer c_timer_;
-    aom_usec_timer_start(&c_timer_);
-    for (int i = 0; i < numIter; i++) {
-      c_func_(hbuf_c_, source_data_, 0, height_);
-    }
-    aom_usec_timer_mark(&c_timer_);
-
-    aom_usec_timer asm_timer_;
-    aom_usec_timer_start(&asm_timer_);
-
-    for (int i = 0; i < numIter; i++) {
-      asm_func_(hbuf_asm_, source_data_, 0, height_);
-    }
-    aom_usec_timer_mark(&asm_timer_);
-
-    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
-    const int asm_sum_time =
-        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
-
-    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
-           asm_sum_time,
-           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
-
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch\n";
-  }
-
- private:
-  IntProRowFunc asm_func_;
-  IntProRowFunc c_func_;
-  int16_t *hbuf_asm_;
-  int16_t *hbuf_c_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
-
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
-
-// Params: width, asm function, c function.
-typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
-
-class IntProColTest : public AverageTestBase<uint8_t>,
-                      public ::testing::WithParamInterface<IntProColParam> {
- public:
-  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
-  }
-
- protected:
-  void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
-  }
-  void RunSpeedTest() {
-    const int numIter = 5000000;
-    printf("Width = %d number of iteration is %d \n", width_, numIter);
-    aom_usec_timer c_timer_;
-    aom_usec_timer_start(&c_timer_);
-    for (int i = 0; i < numIter; i++) {
-      sum_c_ = c_func_(source_data_, width_);
-    }
-    aom_usec_timer_mark(&c_timer_);
-
-    aom_usec_timer asm_timer_;
-    aom_usec_timer_start(&asm_timer_);
-
-    for (int i = 0; i < numIter; i++) {
-      sum_asm_ = asm_func_(source_data_, width_);
-    }
-    aom_usec_timer_mark(&asm_timer_);
-
-    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
-    const int asm_sum_time =
-        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
-
-    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
-           asm_sum_time,
-           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
-
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch \n";
-  }
-
- private:
-  IntProColFunc asm_func_;
-  IntProColFunc c_func_;
-  int16_t sum_asm_;
-  int16_t sum_c_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
-
-TEST_P(IntProRowTest, MinValue) {
-  FillConstant(0);
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, MaxValue) {
-  FillConstant(255);
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, Random) {
-  FillRandom();
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, DISABLED_Speed) {
-  FillRandom();
-  RunSpeedTest();
-}
-
-TEST_P(IntProColTest, MinValue) {
-  FillConstant(0);
-  RunComparison();
-}
-
-TEST_P(IntProColTest, MaxValue) {
-  FillConstant(255);
-  RunComparison();
-}
-
-TEST_P(IntProColTest, Random) {
-  FillRandom();
-  RunComparison();
-}
-
-TEST_P(IntProColTest, DISABLED_Speed) {
-  FillRandom();
-  RunSpeedTest();
-}
-class VectorVarTestBase : public ::testing::Test {
- public:
-  explicit VectorVarTestBase(int bwl) { m_bwl = bwl; }
-  VectorVarTestBase() {}
-  ~VectorVarTestBase() {}
-
- protected:
-  static const int kDataAlignment = 16;
-
-  virtual void SetUp() {
-    width = 4 << m_bwl;
-
-    ref_vector = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, width * sizeof(ref_vector[0])));
-    ASSERT_TRUE(ref_vector != NULL);
-    src_vector = static_cast<int16_t *>(
-        aom_memalign(kDataAlignment, width * sizeof(src_vector[0])));
-    ASSERT_TRUE(src_vector != NULL);
-
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-  virtual void TearDown() {
-    aom_free(ref_vector);
-    ref_vector = NULL;
-    aom_free(src_vector);
-    src_vector = NULL;
-    libaom_test::ClearSystemState();
-  }
-
-  void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
-    for (int i = 0; i < width; ++i) {
-      ref_vector[i] = fill_constant_ref;
-      src_vector[i] = fill_constant_src;
-    }
-  }
-
-  void FillRandom() {
-    for (int i = 0; i < width; ++i) {
-      ref_vector[i] =
-          rnd_.Rand16() % max_range;  // acc. aom_vector_var_c brief.
-      src_vector[i] = rnd_.Rand16() % max_range;
-    }
-  }
-
-  int width;
-  int m_bwl;
-  int16_t *ref_vector;
-  int16_t *src_vector;
-  ACMRandom rnd_;
-
-  static const int max_range = 510;
-  static const int num_random_cmp = 50;
-};
-
-typedef int (*VectorVarFunc)(const int16_t *ref, const int16_t *src,
-                             const int bwl);
-
-typedef std::tuple<int, VectorVarFunc, VectorVarFunc> VecVarFunc;
-
-class VectorVarTest : public VectorVarTestBase,
-                      public ::testing::WithParamInterface<VecVarFunc> {
- public:
-  VectorVarTest()
-      : VectorVarTestBase(GET_PARAM(0)), c_func(GET_PARAM(1)),
-        simd_func(GET_PARAM(2)) {}
-
- protected:
-  int calcVarC() { return c_func(ref_vector, src_vector, m_bwl); }
-  int calcVarSIMD() { return simd_func(ref_vector, src_vector, m_bwl); }
-
-  VectorVarFunc c_func;
-  VectorVarFunc simd_func;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VectorVarTest);
-
-TEST_P(VectorVarTest, MaxVar) {
-  FillConstant(0, max_range);
-  int c_var = calcVarC();
-  int simd_var = calcVarSIMD();
-  ASSERT_EQ(c_var, simd_var);
-}
-TEST_P(VectorVarTest, MaxVarRev) {
-  FillConstant(max_range, 0);
-  int c_var = calcVarC();
-  int simd_var = calcVarSIMD();
-  ASSERT_EQ(c_var, simd_var);
-}
-TEST_P(VectorVarTest, ZeroDiff) {
-  FillConstant(0, 0);
-  int c_var = calcVarC();
-  int simd_var = calcVarSIMD();
-  ASSERT_EQ(c_var, simd_var);
-}
-TEST_P(VectorVarTest, ZeroDiff2) {
-  FillConstant(max_range, max_range);
-  int c_var = calcVarC();
-  int simd_var = calcVarSIMD();
-  ASSERT_EQ(c_var, simd_var);
-}
-TEST_P(VectorVarTest, Constant) {
-  FillConstant(30, 90);
-  int c_var = calcVarC();
-  int simd_var = calcVarSIMD();
-  ASSERT_EQ(c_var, simd_var);
-}
-TEST_P(VectorVarTest, Random) {
-  for (size_t i = 0; i < num_random_cmp; i++) {
-    FillRandom();
-    int c_var = calcVarC();
-    int simd_var = calcVarSIMD();
-    ASSERT_EQ(c_var, simd_var);
-  }
-}
-TEST_P(VectorVarTest, DISABLED_Speed) {
-  FillRandom();
-  const int numIter = 50000;
-  printf("Width = %d number of iteration is %d \n", width, numIter);
-
-  int sum_c_var = 0;
-  int c_var = 0;
-
-  aom_usec_timer c_timer_;
-  aom_usec_timer_start(&c_timer_);
-  for (size_t i = 0; i < numIter; i++) {
-    c_var = calcVarC();
-    sum_c_var += c_var;
-  }
-  aom_usec_timer_mark(&c_timer_);
-
-  int simd_var = 0;
-  int sum_simd_var = 0;
-  aom_usec_timer simd_timer_;
-  aom_usec_timer_start(&simd_timer_);
-  for (size_t i = 0; i < numIter; i++) {
-    simd_var = calcVarSIMD();
-    sum_simd_var += simd_var;
-  }
-  aom_usec_timer_mark(&simd_timer_);
-
-  const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
-  const int simd_sum_time =
-      static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
-
-  printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
-         simd_sum_time,
-         (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
-
-  EXPECT_EQ(c_var, simd_var) << "Output mismatch \n";
-  EXPECT_EQ(sum_c_var, sum_simd_var) << "Output mismatch \n";
-}
-
-using std::make_tuple;
-
-INSTANTIATE_TEST_SUITE_P(
-    C, AverageTest,
-    ::testing::Values(make_tuple(16, 16, 1, 8, &aom_avg_8x8_c),
-                      make_tuple(16, 16, 1, 4, &aom_avg_4x4_c)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AverageTest,
-    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_sse2),
-                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_sse2),
-                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_sse2),
-                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
-                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
-                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
-                      make_tuple(128, &aom_int_pro_row_sse2,
-                                 &aom_int_pro_row_c)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
-                      make_tuple(128, &aom_int_pro_col_sse2,
-                                 &aom_int_pro_col_c)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AverageTest,
-    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_neon),
-                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_neon),
-                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_neon),
-                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon),
-                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon),
-                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon)));
-INSTANTIATE_TEST_SUITE_P(
-    NEON, IntProRowTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
-                      make_tuple(128, &aom_int_pro_row_neon,
-                                 &aom_int_pro_row_c)));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON, IntProColTest,
-    ::testing::Values(make_tuple(16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
-                      make_tuple(128, &aom_int_pro_col_neon,
-                                 &aom_int_pro_col_c)));
-#endif
-
-typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
-typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
-class SatdTest : public ::testing::Test,
-                 public ::testing::WithParamInterface<SatdTestParam> {
- protected:
-  virtual void SetUp() {
-    satd_size_ = GET_PARAM(0);
-    satd_func_ref_ = GET_PARAM(1);
-    satd_func_simd_ = GET_PARAM(2);
-
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<tran_low_t *>(
-        aom_memalign(32, sizeof(*src_) * satd_size_));
-    ASSERT_TRUE(src_ != NULL);
-  }
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
-  void FillConstant(const tran_low_t val) {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
-  }
-  void FillRandom() {
-    for (int i = 0; i < satd_size_; ++i) {
-      src_[i] = static_cast<int16_t>(rnd_.Rand16());
-    }
-  }
-  void Check(int expected) {
-    int total_ref;
-    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
-    EXPECT_EQ(expected, total_ref);
-
-    int total_simd;
-    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
-    EXPECT_EQ(expected, total_simd);
-  }
-  void RunComparison() {
-    int total_ref;
-    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
-
-    int total_simd;
-    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
-
-    EXPECT_EQ(total_ref, total_simd);
-  }
-  void RunSpeedTest() {
-    const int numIter = 500000;
-    printf("size = %d number of iteration is %d \n", satd_size_, numIter);
-
-    int total_ref;
-    aom_usec_timer c_timer_;
-    aom_usec_timer_start(&c_timer_);
-    for (int i = 0; i < numIter; i++) {
-      total_ref = satd_func_ref_(src_, satd_size_);
-    }
-    aom_usec_timer_mark(&c_timer_);
-
-    int total_simd;
-    aom_usec_timer simd_timer_;
-    aom_usec_timer_start(&simd_timer_);
-
-    for (int i = 0; i < numIter; i++) {
-      total_simd = satd_func_simd_(src_, satd_size_);
-    }
-    aom_usec_timer_mark(&simd_timer_);
-
-    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
-    const int simd_sum_time =
-        static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
-
-    printf(
-        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
-        simd_sum_time,
-        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
-
-    EXPECT_EQ(total_ref, total_simd) << "Output mismatch \n";
-  }
-  int satd_size_;
-
- private:
-  tran_low_t *src_;
-  SatdFunc satd_func_ref_;
-  SatdFunc satd_func_simd_;
-  ACMRandom rnd_;
-};
-
-TEST_P(SatdTest, MinValue) {
-  const int kMin = -32640;
-  const int expected = -kMin * satd_size_;
-  FillConstant(kMin);
-  Check(expected);
-}
-TEST_P(SatdTest, MaxValue) {
-  const int kMax = 32640;
-  const int expected = kMax * satd_size_;
-  FillConstant(kMax);
-  Check(expected);
-}
-TEST_P(SatdTest, Random) {
-  int expected;
-  switch (satd_size_) {
-    case 16: expected = 205298; break;
-    case 64: expected = 1113950; break;
-    case 256: expected = 4268415; break;
-    case 1024: expected = 16954082; break;
-    default:
-      FAIL() << "Invalid satd size (" << satd_size_
-             << ") valid: 16/64/256/1024";
-  }
-  FillRandom();
-  Check(expected);
-}
-TEST_P(SatdTest, Match) {
-  FillRandom();
-  RunComparison();
-}
-TEST_P(SatdTest, DISABLED_Speed) {
-  FillRandom();
-  RunSpeedTest();
-}
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, SatdTest,
-    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(64, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(256, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
-INSTANTIATE_TEST_SUITE_P(
-    NEON, VectorVarTest,
-    ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
-                      make_tuple(3, &aom_vector_var_c, &aom_vector_var_neon),
-                      make_tuple(4, &aom_vector_var_c, &aom_vector_var_neon),
-                      make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
-#endif
-
-}  // namespace
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
index dd7a1f5..b5876f5 100644
--- a/test/blend_a64_mask_1d_test.cc
+++ b/test/blend_a64_mask_1d_test.cc
@@ -112,111 +112,6 @@
 };
 
 //////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-                    uint32_t src0_stride, const uint8_t *src1,
-                    uint32_t src1_stride, const uint8_t *mask, int w, int h);
-typedef libaom_test::FuncParam<F8B> TestFuncs;
-
-class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
- protected:
-  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
-    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-                     src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
-                     w_, h_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
-        dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
-        src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_));
-  }
-};
-
-TEST_P(BlendA64Mask1DTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_.Rand8();
-      dst_tst_[i] = rng_.Rand8();
-
-      src0_[i] = rng_.Rand8();
-      src1_[i] = rng_.Rand8();
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    Common();
-  }
-}
-
-TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(2) + 254;
-      dst_tst_[i] = rng_(2) + 254;
-      src0_[i] = rng_(2) + 254;
-      src1_[i] = rng_(2) + 254;
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
-
-    Common();
-  }
-}
-
-static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
-                [BlendA64Mask1DTest8B::kMaxMaskSize];
-
-  for (int row = 0; row < h; ++row)
-    for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col];
-
-  aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
-                       0, 0);
-}
-
-static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
-                [BlendA64Mask1DTest8B::kMaxMaskSize];
-
-  for (int row = 0; row < h; ++row)
-    for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row];
-
-  aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
-                       0, 0);
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    C, BlendA64Mask1DTest8B,
-    ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_c),
-                      TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_c)));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, BlendA64Mask1DTest8B,
-    ::testing::Values(
-        TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_sse4_1),
-        TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, BlendA64Mask1DTest8B,
-    ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_neon),
-                      TestFuncs(blend_a64_vmask_ref,
-                                aom_blend_a64_vmask_neon)));
-#endif  // HAVE_NEON
-
-//////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index 04ee527..c43b3e66 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -153,219 +153,6 @@
 };
 
 //////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-                    uint32_t src0_stride, const uint8_t *src1,
-                    uint32_t src1_stride, const uint8_t *mask,
-                    uint32_t mask_stride, int w, int h, int subx, int suby);
-typedef libaom_test::FuncParam<F8B> TestFuncs;
-
-class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
- protected:
-  void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
-                       p_src0 + src0_offset_, src0_stride_,
-                       p_src1 + src1_offset_, src1_stride_, mask_,
-                       kMaxMaskWidth, w_, h_, subx_, suby_);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
-                       p_src0 + src0_offset_, src0_stride_,
-                       p_src1 + src1_offset_, src1_stride_, mask_,
-                       kMaxMaskWidth, w_, h_, subx_, suby_);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 1) {
-      printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
-             time1, time2);
-      printf("(%3.2f)\n", time1 / time2);
-    }
-  }
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B);
-
-TEST_P(BlendA64MaskTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_.Rand8();
-      dst_tst_[i] = rng_.Rand8();
-
-      src0_[i] = rng_.Rand8();
-      src1_[i] = rng_.Rand8();
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    RunTest(bsize, 1);
-  }
-}
-
-TEST_P(BlendA64MaskTest8B, ExtremeValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_(2) + 254;
-      dst_tst_[i] = rng_(2) + 254;
-      src0_[i] = rng_(2) + 254;
-      src1_[i] = rng_(2) + 254;
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
-
-    RunTest(bsize, 1);
-  }
-}
-TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
-  const int kRunTimes = 10000000;
-  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_.Rand8();
-      dst_tst_[i] = rng_.Rand8();
-
-      src0_[i] = rng_.Rand8();
-      src1_[i] = rng_.Rand8();
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    RunOneTest(bsize, 1, 1, kRunTimes);
-    RunOneTest(bsize, 1, 0, kRunTimes);
-    RunOneTest(bsize, 0, 1, kRunTimes);
-    RunOneTest(bsize, 0, 0, kRunTimes);
-  }
-}
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, BlendA64MaskTest8B,
-                         ::testing::Values(TestFuncs(
-                             aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, BlendA64MaskTest8B,
-                         ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
-                                                     aom_blend_a64_mask_avx2)));
-#endif  // HAVE_AVX2
-
-//////////////////////////////////////////////////////////////////////////////
-// 8 bit _d16 version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*F8B_D16)(uint8_t *dst, uint32_t dst_stride, const uint16_t *src0,
-                        uint32_t src0_stride, const uint16_t *src1,
-                        uint32_t src1_stride, const uint8_t *mask,
-                        uint32_t mask_stride, int w, int h, int subx, int suby,
-                        ConvolveParams *conv_params);
-typedef libaom_test::FuncParam<F8B_D16> TestFuncs_d16;
-
-class BlendA64MaskTest8B_d16
-    : public BlendA64MaskTest<F8B_D16, uint16_t, uint8_t> {
- protected:
-  // max number of bits used by the source
-  static const int kSrcMaxBitsMask = 0x3fff;
-
-  void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
-    ConvolveParams conv_params;
-    conv_params.round_0 = ROUND0_BITS;
-    conv_params.round_1 = COMPOUND_ROUND1_BITS;
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
-                       p_src0 + src0_offset_, src0_stride_,
-                       p_src1 + src1_offset_, src1_stride_, mask_,
-                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
-                       p_src0 + src0_offset_, src0_stride_,
-                       p_src1 + src1_offset_, src1_stride_, mask_,
-                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 1) {
-      printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
-             time1, time2);
-      printf("(%3.2f)\n", time1 / time2);
-    }
-  }
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B_d16);
-
-TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = rng_.Rand8();
-      dst_tst_[i] = rng_.Rand8();
-
-      src0_[i] = rng_.Rand16() & kSrcMaxBitsMask;
-      src1_[i] = rng_.Rand16() & kSrcMaxBitsMask;
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
-
-    RunTest(bsize, 1);
-  }
-}
-
-TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
-    for (int i = 0; i < kBufSize; ++i) {
-      dst_ref_[i] = 255;
-      dst_tst_[i] = 255;
-
-      src0_[i] = kSrcMaxBitsMask;
-      src1_[i] = kSrcMaxBitsMask;
-    }
-
-    for (int i = 0; i < kMaxMaskSize; ++i)
-      mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
-
-    RunTest(bsize, 1);
-  }
-}
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, BlendA64MaskTest8B_d16,
-    ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
-                                    aom_lowbd_blend_a64_d16_mask_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, BlendA64MaskTest8B_d16,
-    ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
-                                    aom_lowbd_blend_a64_d16_mask_avx2)));
-#endif  // HAVE_AVX2
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, BlendA64MaskTest8B_d16,
-    ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
-                                    aom_lowbd_blend_a64_d16_mask_neon)));
-#endif  // HAVE_NEON
-
-//////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index fe8bdb0..fb5e2a1 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -272,49 +272,6 @@
   }
 };
 
-typedef cfl_subsample_lbd_fn (*get_subsample_lbd_fn)(TX_SIZE tx_size);
-typedef std::tuple<TX_SIZE, get_subsample_lbd_fn, get_subsample_lbd_fn,
-                   get_subsample_lbd_fn>
-    subsample_lbd_param;
-class CFLSubsampleLBDTest
-    : public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
-                              uint8_t> {
- public:
-  virtual ~CFLSubsampleLBDTest() {}
-  virtual void SetUp() {
-    CFLSubsampleTest::SetUp();
-    fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
-    fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
-    fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size);
-  }
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleLBDTest);
-
-TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) {
-  subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
-                &ACMRandom::Rand8);
-}
-
-TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD420SpeedTest) {
-  subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand8);
-}
-
-TEST_P(CFLSubsampleLBDTest, SubsampleLBD422Test) {
-  subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand8);
-}
-
-TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD422SpeedTest) {
-  subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand8);
-}
-
-TEST_P(CFLSubsampleLBDTest, SubsampleLBD444Test) {
-  subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand8);
-}
-
-TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD444SpeedTest) {
-  subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8);
-}
-
 typedef cfl_subsample_hbd_fn (*get_subsample_hbd_fn)(TX_SIZE tx_size);
 typedef std::tuple<TX_SIZE, get_subsample_hbd_fn, get_subsample_hbd_fn,
                    get_subsample_hbd_fn>
@@ -358,53 +315,6 @@
   subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12);
 }
 
-typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
-typedef std::tuple<TX_SIZE, get_predict_fn> predict_param;
-class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
-                       public CFLTestWithAlignedData<uint8_t> {
- public:
-  virtual void SetUp() {
-    CFLTest::init(std::get<0>(this->GetParam()));
-    predict = std::get<1>(this->GetParam())(tx_size);
-    predict_ref = cfl_get_predict_lbd_fn_c(tx_size);
-  }
-  virtual ~CFLPredictTest() {}
-
- protected:
-  cfl_predict_lbd_fn predict;
-  cfl_predict_lbd_fn predict_ref;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictTest);
-
-TEST_P(CFLPredictTest, PredictTest) {
-  for (int it = 0; it < NUM_ITERATIONS; it++) {
-    randData(8);
-    predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
-    predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
-    assert_eq<uint8_t>(chroma_pels, chroma_pels_ref, width, height);
-  }
-}
-TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) {
-  aom_usec_timer ref_timer;
-  aom_usec_timer timer;
-  randData(8);
-  aom_usec_timer_start(&ref_timer);
-  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
-    predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
-  }
-  aom_usec_timer_mark(&ref_timer);
-  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
-
-  aom_usec_timer_start(&timer);
-  for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
-    predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
-  }
-  aom_usec_timer_mark(&timer);
-  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
-  printSpeed(ref_elapsed_time, elapsed_time, width, height);
-  assertFaster(ref_elapsed_time, elapsed_time);
-}
-
 typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size);
 typedef std::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
 class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
@@ -464,21 +374,6 @@
 #endif
 
 #if HAVE_SSSE3
-const subsample_lbd_param subsample_lbd_sizes_ssse3[] = {
-  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_ssse3,
-                             cfl_get_luma_subsampling_422_lbd_ssse3,
-                             cfl_get_luma_subsampling_444_lbd_ssse3)
-};
-
-const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
-    cfl_get_predict_lbd_fn_ssse3) };
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleLBDTest,
-                         ::testing::ValuesIn(subsample_lbd_sizes_ssse3));
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictTest,
-                         ::testing::ValuesIn(predict_sizes_ssse3));
-
 const subsample_hbd_param subsample_hbd_sizes_ssse3[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3,
                              cfl_get_luma_subsampling_422_hbd_ssse3,
@@ -499,24 +394,9 @@
 const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
     cfl_get_subtract_average_fn_avx2) };
 
-const subsample_lbd_param subsample_lbd_sizes_avx2[] = {
-  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2,
-                             cfl_get_luma_subsampling_422_lbd_avx2,
-                             cfl_get_luma_subsampling_444_lbd_avx2)
-};
-
-const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
-    cfl_get_predict_lbd_fn_avx2) };
-
 INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubAvgTest,
                          ::testing::ValuesIn(sub_avg_sizes_avx2));
 
-INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleLBDTest,
-                         ::testing::ValuesIn(subsample_lbd_sizes_avx2));
-
-INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictTest,
-                         ::testing::ValuesIn(predict_sizes_avx2));
-
 const subsample_hbd_param subsample_hbd_sizes_avx2[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2,
                              cfl_get_luma_subsampling_422_hbd_avx2,
@@ -537,24 +417,9 @@
 const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
     cfl_get_subtract_average_fn_neon) };
 
-const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(
-    cfl_get_predict_lbd_fn_neon) };
-
-const subsample_lbd_param subsample_lbd_sizes_neon[] = {
-  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon,
-                             cfl_get_luma_subsampling_422_lbd_neon,
-                             cfl_get_luma_subsampling_444_lbd_neon)
-};
-
 INSTANTIATE_TEST_SUITE_P(NEON, CFLSubAvgTest,
                          ::testing::ValuesIn(sub_avg_sizes_neon));
 
-INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleLBDTest,
-                         ::testing::ValuesIn(subsample_lbd_sizes_neon));
-
-INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictTest,
-                         ::testing::ValuesIn(predict_sizes_neon));
-
 const subsample_hbd_param subsample_hbd_sizes_neon[] = {
   ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
                              cfl_get_luma_subsampling_422_hbd_neon,
diff --git a/test/coding_path_sync.cc b/test/coding_path_sync.cc
deleted file mode 100644
index b84b57d..0000000
--- a/test/coding_path_sync.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <vector>
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/acm_random.h"
-
-#include "config/aom_config.h"
-
-#include "aom/aomcx.h"
-#include "aom/aomdx.h"
-#include "aom/aom_encoder.h"
-#include "aom/aom_decoder.h"
-
-#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
-
-using libaom_test::ACMRandom;
-namespace {
-
-class CompressedSource {
- public:
-  explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
-    aom_codec_iface_t *algo = aom_codec_av1_cx();
-
-    aom_codec_enc_cfg_t cfg;
-    aom_codec_enc_config_default(algo, &cfg, 0);
-
-    // force the quantizer, to reduce the sensitivity on encoding choices.
-    // e.g, we don't want this test to break when the rate control is modified.
-    {
-      const int max_qp = cfg.rc_max_quantizer;
-      const int min_qp = cfg.rc_min_quantizer;
-      const int q = rnd_.PseudoUniform(max_qp - min_qp + 1) + min_qp;
-
-      cfg.rc_end_usage = AOM_Q;
-      cfg.rc_max_quantizer = q;
-      cfg.rc_min_quantizer = q;
-    }
-
-    // choose the picture size
-    {
-      width_ = rnd_.PseudoUniform(kWidth - 8) + 8;
-      height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
-    }
-
-    // choose the chroma subsampling
-    {
-      const aom_img_fmt_t fmts[] = {
-        AOM_IMG_FMT_I420,
-        AOM_IMG_FMT_I422,
-        AOM_IMG_FMT_I444,
-      };
-
-      format_ = fmts[rnd_.PseudoUniform(NELEMENTS(fmts))];
-    }
-
-    cfg.g_w = width_;
-    cfg.g_h = height_;
-    cfg.g_lag_in_frames = 0;
-    if (format_ == AOM_IMG_FMT_I420)
-      cfg.g_profile = 0;
-    else if (format_ == AOM_IMG_FMT_I444)
-      cfg.g_profile = 1;
-    else if (format_ == AOM_IMG_FMT_I422)
-      cfg.g_profile = 2;
-
-    aom_codec_enc_init(&enc_, algo, &cfg, 0);
-  }
-
-  ~CompressedSource() { aom_codec_destroy(&enc_); }
-
-  const aom_codec_cx_pkt_t *ReadFrame() {
-    uint8_t buf[kWidth * kHeight * 3] = { 0 };
-
-    // render regular pattern
-    const int period = rnd_.Rand8() % 32 + 1;
-    const int phase = rnd_.Rand8() % period;
-
-    const int val_a = rnd_.Rand8();
-    const int val_b = rnd_.Rand8();
-
-    for (int i = 0; i < (int)sizeof buf; ++i)
-      buf[i] = (i + phase) % period < period / 2 ? val_a : val_b;
-
-    aom_image_t img;
-    aom_img_wrap(&img, format_, width_, height_, 0, buf);
-    aom_codec_encode(&enc_, &img, frame_count_++, 1, 0);
-
-    aom_codec_iter_t iter = NULL;
-
-    const aom_codec_cx_pkt_t *pkt = NULL;
-
-    do {
-      pkt = aom_codec_get_cx_data(&enc_, &iter);
-    } while (pkt && pkt->kind != AOM_CODEC_CX_FRAME_PKT);
-
-    return pkt;
-  }
-
- private:
-  static const int kWidth = 128;
-  static const int kHeight = 128;
-
-  ACMRandom rnd_;
-  aom_img_fmt_t format_;
-  aom_codec_ctx_t enc_;
-  int frame_count_;
-  int width_, height_;
-};
-
-// lowers an aom_image_t to a easily comparable/printable form
-std::vector<int16_t> Serialize(const aom_image_t *img) {
-  std::vector<int16_t> bytes;
-  bytes.reserve(img->d_w * img->d_h * 3);
-  for (int plane = 0; plane < 3; ++plane) {
-    const int w = aom_img_plane_width(img, plane);
-    const int h = aom_img_plane_height(img, plane);
-
-    for (int r = 0; r < h; ++r) {
-      for (int c = 0; c < w; ++c) {
-        unsigned char *row = img->planes[plane] + r * img->stride[plane];
-        if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
-          bytes.push_back(row[c * 2]);
-        else
-          bytes.push_back(row[c]);
-      }
-    }
-  }
-
-  return bytes;
-}
-
-class Decoder {
- public:
-  explicit Decoder(int allowLowbitdepth) {
-    aom_codec_iface_t *algo = aom_codec_av1_dx();
-
-    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
-    cfg.allow_lowbitdepth = allowLowbitdepth;
-
-    aom_codec_dec_init(&dec_, algo, &cfg, 0);
-  }
-
-  ~Decoder() { aom_codec_destroy(&dec_); }
-
-  std::vector<int16_t> decode(const aom_codec_cx_pkt_t *pkt) {
-    aom_codec_decode(&dec_, static_cast<uint8_t *>(pkt->data.frame.buf),
-                     pkt->data.frame.sz, NULL);
-
-    aom_codec_iter_t iter = NULL;
-    return Serialize(aom_codec_get_frame(&dec_, &iter));
-  }
-
- private:
-  aom_codec_ctx_t dec_;
-};
-
-// Try to reveal a mismatch between LBD and HBD coding paths.
-#if CONFIG_CCSO_EXT
-TEST(DISABLED_CodingPathSync, SearchForHbdLbdMismatch) {
-#else
-TEST(CodingPathSync, SearchForHbdLbdMismatch) {
-#endif
-  const int count_tests = 10;
-  for (int i = 0; i < count_tests; ++i) {
-    Decoder dec_hbd(0);
-    Decoder dec_lbd(1);
-
-    CompressedSource enc(i);
-
-    for (int k = 0; k < 3; ++k) {
-      const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
-
-      std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
-      std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
-
-      ASSERT_EQ(lbd_yuv, hbd_yuv);
-    }
-  }
-}
-
-#if CONFIG_CCSO_EXT
-TEST(DISABLED_CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) {
-#else
-TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) {
-#endif
-  const int count_tests = 100;
-  const int seed = 1234;
-  for (int i = 0; i < count_tests; ++i) {
-    Decoder dec_hbd(0);
-    Decoder dec_lbd(1);
-
-    CompressedSource enc(seed + i);
-
-    for (int k = 0; k < 5; ++k) {
-      const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
-
-      std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
-      std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
-
-      ASSERT_EQ(lbd_yuv, hbd_yuv);
-    }
-  }
-}
-
-}  // namespace
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index b53d60d..d9bfb02 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -13,10 +13,6 @@
 #include "test/comp_avg_pred_test.h"
 
 using libaom_test::ACMRandom;
-using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest);
-using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest);
 using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
@@ -27,30 +23,6 @@
 
 namespace {
 
-TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-
-TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest,
-                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
-                             aom_dist_wtd_comp_avg_pred_ssse3));
-#endif
-
-TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0));
-}
-
-TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
-  RunCheckOutput(GET_PARAM(0));
-}
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
-                         libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
-                             aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
-#endif
-
 TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
   RunSpeedTest(GET_PARAM(1));
 }
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index a879c12..e93a5b6 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -86,233 +86,6 @@
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
 
-class AV1DISTWTDCOMPAVGTest
-    : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
- public:
-  ~AV1DISTWTDCOMPAVGTest() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunCheckOutput(distwtdcompavg_func test_impl) {
-    const int w = kMaxSize, h = kMaxSize;
-    const int block_idx = GET_PARAM(1);
-
-    uint8_t pred8[kMaxSize * kMaxSize];
-    uint8_t ref8[kMaxSize * kMaxSize];
-    uint8_t output[kMaxSize * kMaxSize];
-    uint8_t output2[kMaxSize * kMaxSize];
-
-    for (int i = 0; i < h; ++i)
-      for (int j = 0; j < w; ++j) {
-        pred8[i * w + j] = rnd_.Rand8();
-        ref8[i * w + j] = rnd_.Rand8();
-      }
-    const int in_w = block_size_wide[block_idx];
-    const int in_h = block_size_high[block_idx];
-
-    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
-
-    for (int ii = 0; ii < 2; ii++) {
-      for (int jj = 0; jj < 4; jj++) {
-        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
-        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
-
-        const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
-        const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
-        aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c,
-                                     in_w, in_h, ref8 + offset_r * w + offset_c,
-                                     in_w, &dist_wtd_comp_params);
-        test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h,
-                  ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params);
-
-        for (int i = 0; i < in_h; ++i) {
-          for (int j = 0; j < in_w; ++j) {
-            int idx = i * in_w + j;
-            ASSERT_EQ(output[idx], output2[idx])
-                << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n"
-                << in_w << "x" << in_h << " Pixel mismatch at index " << idx
-                << " = (" << i << ", " << j << ")";
-          }
-        }
-      }
-    }
-  }
-  void RunSpeedTest(distwtdcompavg_func test_impl) {
-    const int w = kMaxSize, h = kMaxSize;
-    const int block_idx = GET_PARAM(1);
-
-    uint8_t pred8[kMaxSize * kMaxSize];
-    uint8_t ref8[kMaxSize * kMaxSize];
-    uint8_t output[kMaxSize * kMaxSize];
-    uint8_t output2[kMaxSize * kMaxSize];
-
-    for (int i = 0; i < h; ++i)
-      for (int j = 0; j < w; ++j) {
-        pred8[i * w + j] = rnd_.Rand8();
-        ref8[i * w + j] = rnd_.Rand8();
-      }
-    const int in_w = block_size_wide[block_idx];
-    const int in_h = block_size_high[block_idx];
-
-    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
-
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
-
-    const int num_loops = 1000000000 / (in_w + in_h);
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-
-    for (int i = 0; i < num_loops; ++i)
-      aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
-                                   &dist_wtd_comp_params);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
-           1000.0 * elapsed_time / num_loops);
-
-    aom_usec_timer timer1;
-    aom_usec_timer_start(&timer1);
-
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params);
-
-    aom_usec_timer_mark(&timer1);
-    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-    printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
-           1000.0 * elapsed_time1 / num_loops);
-  }
-
-  libaom_test::ACMRandom rnd_;
-};  // class AV1DISTWTDCOMPAVGTest
-
-class AV1DISTWTDCOMPAVGUPSAMPLEDTest
-    : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> {
- public:
-  ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
-  void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
-    const int w = kMaxSize, h = kMaxSize;
-    const int block_idx = GET_PARAM(1);
-
-    uint8_t pred8[kMaxSize * kMaxSize];
-    uint8_t ref8[kMaxSize * kMaxSize];
-    DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
-
-    for (int i = 0; i < h; ++i)
-      for (int j = 0; j < w; ++j) {
-        pred8[i * w + j] = rnd_.Rand8();
-        ref8[i * w + j] = rnd_.Rand8();
-      }
-    const int in_w = block_size_wide[block_idx];
-    const int in_h = block_size_high[block_idx];
-
-    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
-    int sub_x_q3, sub_y_q3;
-    int subpel_search;
-    for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
-         ++subpel_search) {
-      for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
-        for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
-          for (int ii = 0; ii < 2; ii++) {
-            for (int jj = 0; jj < 4; jj++) {
-              dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
-              dist_wtd_comp_params.bck_offset =
-                  quant_dist_lookup_table[jj][1 - ii];
-
-              const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
-              const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
-
-              aom_dist_wtd_comp_avg_upsampled_pred_c(
-                  NULL, NULL, 0, 0, NULL, output,
-                  pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
-                  sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
-                  &dist_wtd_comp_params, subpel_search);
-              test_impl(NULL, NULL, 0, 0, NULL, output2,
-                        pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
-                        sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
-                        &dist_wtd_comp_params, subpel_search);
-
-              for (int i = 0; i < in_h; ++i) {
-                for (int j = 0; j < in_w; ++j) {
-                  int idx = i * in_w + j;
-                  ASSERT_EQ(output[idx], output2[idx])
-                      << "Mismatch at unit tests for "
-                         "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n"
-                      << in_w << "x" << in_h << " Pixel mismatch at index "
-                      << idx << " = (" << i << ", " << j
-                      << "), sub pixel offset = (" << sub_y_q3 << ", "
-                      << sub_x_q3 << ")";
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  void RunSpeedTest(distwtdcompavgupsampled_func test_impl) {
-    const int w = kMaxSize, h = kMaxSize;
-    const int block_idx = GET_PARAM(1);
-
-    uint8_t pred8[kMaxSize * kMaxSize];
-    uint8_t ref8[kMaxSize * kMaxSize];
-    DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
-
-    for (int i = 0; i < h; ++i)
-      for (int j = 0; j < w; ++j) {
-        pred8[i * w + j] = rnd_.Rand8();
-        ref8[i * w + j] = rnd_.Rand8();
-      }
-    const int in_w = block_size_wide[block_idx];
-    const int in_h = block_size_high[block_idx];
-
-    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
-
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
-
-    int sub_x_q3 = 0;
-    int sub_y_q3 = 0;
-
-    const int num_loops = 1000000000 / (in_w + in_h);
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    int subpel_search = USE_8_TAPS;  // set to USE_4_TAPS to test 4-tap filter.
-
-    for (int i = 0; i < num_loops; ++i)
-      aom_dist_wtd_comp_avg_upsampled_pred_c(
-          NULL, NULL, 0, 0, NULL, output, pred8, in_w, in_h, sub_x_q3, sub_y_q3,
-          ref8, in_w, &dist_wtd_comp_params, subpel_search);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
-           1000.0 * elapsed_time / num_loops);
-
-    aom_usec_timer timer1;
-    aom_usec_timer_start(&timer1);
-
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
-                sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search);
-
-    aom_usec_timer_mark(&timer1);
-    const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
-    printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
-           1000.0 * elapsed_time1 / num_loops);
-  }
-
-  libaom_test::ACMRandom rnd_;
-};  // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
-
 class AV1HighBDDISTWTDCOMPAVGTest
     : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
  public:
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index f8e7f3e..4c230d2 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -31,11 +31,6 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 namespace AV1CompMaskVariance {
-typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask);
-
 #if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2
 const BLOCK_SIZE kValidBlockSize[] = {
   BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,   BLOCK_16X8,   BLOCK_16X16,
@@ -44,240 +39,6 @@
   BLOCK_16X64, BLOCK_64X16
 };
 #endif
-typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
-
-class AV1CompMaskVarianceTest
-    : public ::testing::TestWithParam<CompMaskPredParam> {
- public:
-  ~AV1CompMaskVarianceTest();
-  void SetUp();
-
-  void TearDown();
-
- protected:
-  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
-  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
-  bool CheckResult(int width, int height) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        const int idx = y * width + x;
-        if (comp_pred1_[idx] != comp_pred2_[idx]) {
-          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
-          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  libaom_test::ACMRandom rnd_;
-  uint8_t *comp_pred1_;
-  uint8_t *comp_pred2_;
-  uint8_t *pred_;
-  uint8_t *ref_buffer_;
-  uint8_t *ref_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompMaskVarianceTest);
-
-AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
-
-void AV1CompMaskVarianceTest::SetUp() {
-  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  av1_init_wedge_masks();
-  comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
-  comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
-  pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
-  ref_buffer_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
-  ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    pred_[i] = rnd_.Rand8();
-  }
-  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
-    ref_buffer_[i] = rnd_.Rand8();
-  }
-}
-
-void AV1CompMaskVarianceTest::TearDown() {
-  aom_free(comp_pred1_);
-  aom_free(comp_pred2_);
-  aom_free(pred_);
-  aom_free(ref_buffer_);
-  libaom_test::ClearSystemState();
-}
-
-void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
-                                             BLOCK_SIZE bsize, int inv) {
-  const int w = block_size_wide[bsize];
-  const int h = block_size_high[bsize];
-  const int wedge_types = get_wedge_types_lookup(bsize);
-  for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
-    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
-
-    aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
-                         inv);
-    test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
-
-    ASSERT_EQ(CheckResult(w, h), true)
-        << " wedge " << wedge_index << " inv " << inv;
-  }
-}
-
-void AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
-                                           BLOCK_SIZE bsize) {
-  const int w = block_size_wide[bsize];
-  const int h = block_size_high[bsize];
-  const int wedge_types = get_wedge_types_lookup(bsize);
-  int wedge_index = wedge_types / 2;
-  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
-  const int num_loops = 1000000000 / (w + h);
-
-  comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl };
-  double elapsed_time[2] = { 0 };
-  for (int i = 0; i < 2; ++i) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    comp_mask_pred_func func = funcs[i];
-    for (int j = 0; j < num_loops; ++j) {
-      func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0);
-    }
-    aom_usec_timer_mark(&timer);
-    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    elapsed_time[i] = 1000.0 * time / num_loops;
-  }
-  printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
-         elapsed_time[1]);
-  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1CompMaskVarianceTest, CheckOutput) {
-  // inv = 0, 1
-  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
-  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
-}
-
-TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
-}
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AV1CompMaskVarianceTest,
-    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
-                       ::testing::ValuesIn(kValidBlockSize)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1CompMaskVarianceTest,
-    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
-                       ::testing::ValuesIn(kValidBlockSize)));
-#endif
-
-#ifndef aom_comp_mask_pred
-// can't run this test if aom_comp_mask_pred is defined to aom_comp_mask_pred_c
-class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest {
- public:
-  ~AV1CompMaskUpVarianceTest();
-
- protected:
-  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
-  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
-                    int havSub);
-};
-
-AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; }
-
-void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
-                                               BLOCK_SIZE bsize, int inv) {
-  const int w = block_size_wide[bsize];
-  const int h = block_size_high[bsize];
-  const int wedge_types = get_wedge_types_lookup(bsize);
-  int subpel_search;
-  for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
-       ++subpel_search) {
-    // loop through subx and suby
-    for (int sub = 0; sub < 8 * 8; ++sub) {
-      int subx = sub & 0x7;
-      int suby = (sub >> 3);
-      for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
-        const uint8_t *mask =
-            av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
-
-        // ref
-        aom_comp_mask_upsampled_pred_c(
-            NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_,
-            MAX_SB_SIZE, mask, w, inv, subpel_search);
-
-        aom_comp_mask_pred = test_impl;  // test
-        aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
-                                     w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
-                                     w, inv, subpel_search);
-        ASSERT_EQ(CheckResult(w, h), true)
-            << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
-            << "," << suby << ")";
-      }
-    }
-  }
-}
-
-void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
-                                             BLOCK_SIZE bsize, int havSub) {
-  const int w = block_size_wide[bsize];
-  const int h = block_size_high[bsize];
-  const int subx = havSub ? 3 : 0;
-  const int suby = havSub ? 4 : 0;
-  const int wedge_types = get_wedge_types_lookup(bsize);
-  int wedge_index = wedge_types / 2;
-  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
-
-  const int num_loops = 1000000000 / (w + h);
-  comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl };
-  double elapsed_time[2] = { 0 };
-  int subpel_search = USE_8_TAPS;  // set to USE_4_TAPS to test 4-tap filter.
-  for (int i = 0; i < 2; ++i) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    aom_comp_mask_pred = funcs[i];
-    for (int j = 0; j < num_loops; ++j) {
-      aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
-                                   w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
-                                   0, subpel_search);
-    }
-    aom_usec_timer_mark(&timer);
-    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    elapsed_time[i] = 1000.0 * time / num_loops;
-  }
-  printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
-         elapsed_time[1]);
-  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) {
-  // inv mask = 0, 1
-  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
-  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
-}
-
-TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
-}
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AV1CompMaskUpVarianceTest,
-    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
-                       ::testing::ValuesIn(kValidBlockSize)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1CompMaskUpVarianceTest,
-    ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
-                       ::testing::ValuesIn(kValidBlockSize)));
-#endif
-
-#endif  // ifndef aom_comp_mask_pred
 
 typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8,
                                            const uint8_t *pred8, int width,
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 105f291..f4077d4 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -776,15 +776,6 @@
                          ::testing::ValuesIn(kArrayConvolve_sse2));
 #endif
 
-#if HAVE_SSSE3
-const ConvolveFunctions convolve8_ssse3(aom_convolve8_horiz_ssse3,
-                                        aom_convolve8_vert_ssse3, 0);
-
-const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
-INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest,
-                         ::testing::ValuesIn(kArrayConvolve8_ssse3));
-#endif
-
 #if HAVE_AVX2
 const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve8_horiz_avx2_8,
                                             wrap_convolve8_vert_avx2_8, 8);
diff --git a/test/decode_multithreaded_test.cc b/test/decode_multithreaded_test.cc
index b196093..2da1c3c 100644
--- a/test/decode_multithreaded_test.cc
+++ b/test/decode_multithreaded_test.cc
@@ -40,7 +40,6 @@
     cfg.w = 704;
     cfg.h = 576;
     cfg.threads = 1;
-    cfg.allow_lowbitdepth = 1;
     single_thread_dec_ = codec_->CreateDecoder(cfg, 0);
 
     // Test cfg.threads == powers of 2.
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 0bff17b..5e5380e 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -66,7 +66,6 @@
 
   aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
   cfg.threads = threads;
-  cfg.allow_lowbitdepth = 1;
   libaom_test::AV1Decoder decoder(cfg, 0);
 
   aom_usec_timer t;
@@ -215,7 +214,6 @@
 
   aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
   cfg.threads = threads;
-  cfg.allow_lowbitdepth = 1;
   libaom_test::AV1Decoder decoder(cfg, 0);
 
   aom_usec_timer t;
diff --git a/test/edge_detect_test.cc b/test/edge_detect_test.cc
index f5ffa02..29eb0be 100644
--- a/test/edge_detect_test.cc
+++ b/test/edge_detect_test.cc
@@ -23,12 +23,8 @@
 using std::get;
 using std::tuple;
 
-static int get_pix(uint8_t *buf, int i, bool high_bd) {
-  if (high_bd) {
-    return *CONVERT_TO_SHORTPTR(buf + i);
-  } else {
-    return buf[i];
-  }
+static int get_pix(uint8_t *buf, int i) {
+  return *CONVERT_TO_SHORTPTR(buf + i);
 }
 
 /** Get the (i, j) value from the input; if i or j is outside of the width
@@ -45,7 +41,7 @@
- * padded data. Must be freed with free_pad_8tap. The output will be either
- * 8-bit or 16-bit, depending on the high bit-depth (high_bd) field.
+ * padded data. Must be freed with free_pad_8tap. The output is a 16-bit
+ * buffer, returned through the CONVERT_TO_BYTEPTR convention.
  */
-static uint8_t *pad_8tap_convolve(const int *data, int w, int h, bool high_bd) {
+static uint8_t *pad_8tap_convolve(const int *data, int w, int h) {
   // SIMD optimizations require the width to be a multiple of 8 and the height
   // to be multiples of 4.
   assert(w % 8 == 0);
@@ -56,12 +52,7 @@
   const int pad_h = h + 7;
 
   uint8_t *dst;
-  if (high_bd) {
-    dst =
-        CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * pad_w * pad_h));
-  } else {
-    dst = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * pad_w * pad_h);
-  }
+  dst = CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * pad_w * pad_h));
   if (dst == nullptr) {
     EXPECT_NE(dst, nullptr);
     return nullptr;
@@ -70,11 +61,7 @@
   for (int j = 0; j < pad_h; ++j) {
     for (int i = 0; i < pad_w; ++i) {
       const int v = get_nearest_pix(data, w, h, i - 3, j - 3);
-      if (high_bd) {
-        *CONVERT_TO_SHORTPTR(dst + i + j * pad_w) = v;
-      } else {
-        dst[i + j * pad_w] = static_cast<uint8_t>(v);
-      }
+      *CONVERT_TO_SHORTPTR(dst + i + j * pad_w) = v;
     }
   }
   return dst + (w + 7) * 3 + 3;
@@ -82,55 +69,38 @@
 
 static int stride_8tap(int width) { return width + 7; }
 
-static void free_pad_8tap(uint8_t *padded, int width, bool high_bd) {
-  if (high_bd) {
-    aom_free(CONVERT_TO_SHORTPTR(padded - (width + 7) * 3 - 3));
-  } else {
-    aom_free(padded - (width + 7) * 3 - 3);
-  }
+static void free_pad_8tap(uint8_t *padded, int width) {
+  aom_free(CONVERT_TO_SHORTPTR(padded - (width + 7) * 3 - 3));
 }
 
 struct Pad8TapConvolveDeleter {
-  Pad8TapConvolveDeleter(const int width, const bool high_bd)
-      : width(width), high_bd(high_bd) {}
+  explicit Pad8TapConvolveDeleter(const int width) : width(width) {}
   void operator()(uint8_t *p) {
     if (p != nullptr) {
-      free_pad_8tap(p, width, high_bd);
+      free_pad_8tap(p, width);
     }
   }
   const int width;
-  const bool high_bd;
 };
 
-static uint8_t *malloc_bd(int num_entries, bool high_bd) {
-  const int bytes_per_entry = high_bd ? sizeof(uint16_t) : sizeof(uint8_t);
+static uint8_t *malloc_bd(int num_entries) {
+  const int bytes_per_entry = sizeof(uint16_t);
 
   uint8_t *buf = (uint8_t *)aom_memalign(32, bytes_per_entry * num_entries);
-  if (high_bd) {
-    return CONVERT_TO_BYTEPTR(buf);
-  } else {
-    return buf;
-  }
+  return CONVERT_TO_BYTEPTR(buf);
 }
 
-static void free_bd(uint8_t *p, bool high_bd) {
-  if (high_bd) {
-    aom_free(CONVERT_TO_SHORTPTR(p));
-  } else {
-    aom_free(p);
-  }
-}
+static void free_bd(uint8_t *p) { aom_free(CONVERT_TO_SHORTPTR(p)); }
 
 struct MallocBdDeleter {
-  explicit MallocBdDeleter(const bool high_bd) : high_bd(high_bd) {}
-  void operator()(uint8_t *p) { free_bd(p, high_bd); }
-  const bool high_bd;
+  MallocBdDeleter() = default;
+  void operator()(uint8_t *p) { free_bd(p); }
 };
 
 class EdgeDetectBrightnessTest :
-    // Parameters are (brightness, width, height, high bit depth representation,
-    // bit depth).
+    // Parameters are (brightness, width, height, bit depth). All pixel
+    // buffers use the 16-bit representation.
-    public ::testing::TestWithParam<tuple<int, int, int, bool, int> > {
+    public ::testing::TestWithParam<tuple<int, int, int, int> > {
  protected:
   void SetUp() override {
     // Allocate a (width by height) array of luma values in orig_.
@@ -139,7 +109,6 @@
     const int brightness = GET_PARAM(0);
     const int width = GET_PARAM(1);
     const int height = GET_PARAM(2);
-    const bool high_bd = GET_PARAM(3);
 
     // Create the padded image of uniform brightness.
     std::unique_ptr<int[]> orig(new int[width * height]);
@@ -147,17 +116,16 @@
     for (int i = 0; i < width * height; ++i) {
       orig[i] = brightness;
     }
-    input_ = pad_8tap_convolve(orig.get(), width, height, high_bd);
+    input_ = pad_8tap_convolve(orig.get(), width, height);
     ASSERT_NE(input_, nullptr);
-    output_ = malloc_bd(width * height, high_bd);
+    output_ = malloc_bd(width * height);
     ASSERT_NE(output_, nullptr);
   }
 
   void TearDown() override {
     const int width = GET_PARAM(1);
-    const bool high_bd = GET_PARAM(3);
-    free_pad_8tap(input_, width, high_bd);
-    free_bd(output_, high_bd);
+    free_pad_8tap(input_, width);
+    free_bd(output_);
   }
 
   // Skip the tests where brightness exceeds the bit-depth; we run into this
@@ -166,14 +134,10 @@
-  // high bit depth representation is not set.
+  // issue because such brightness values cannot be represented.
   bool should_skip() const {
     const int brightness = GET_PARAM(0);
-    const int bd = GET_PARAM(4);
+    const int bd = GET_PARAM(3);
     if (brightness >= (1 << bd)) {
       return true;
     }
-    const bool high_bd = GET_PARAM(3);
-    if (bd > 8 && !high_bd) {
-      return true;
-    }
     return false;
   }
 
@@ -193,13 +157,11 @@
   const int brightness = GET_PARAM(0);
   const int width = GET_PARAM(1);
   const int height = GET_PARAM(2);
-  const bool high_bd = GET_PARAM(3);
-  const int bd = GET_PARAM(4);
+  const int bd = GET_PARAM(3);
 
-  av1_gaussian_blur(input_, stride_8tap(width), width, height, output_, high_bd,
-                    bd);
+  av1_gaussian_blur(input_, stride_8tap(width), width, height, output_, bd);
   for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(brightness, get_pix(output_, i, high_bd));
+    ASSERT_EQ(brightness, get_pix(output_, i));
   }
 }
 
@@ -210,12 +172,11 @@
   }
   const int width = GET_PARAM(1);
   const int height = GET_PARAM(2);
-  const bool high_bd = GET_PARAM(3);
-  const int bd = GET_PARAM(4);
+  const int bd = GET_PARAM(3);
 
   ASSERT_EQ(
-      0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd)
-             .magnitude);
+      0,
+      av1_edge_exists(input_, stride_8tap(width), width, height, bd).magnitude);
 }
 
 INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
@@ -228,36 +189,18 @@
                              ::testing::Values(8, 16, 32),
                              // Height
                              ::testing::Values(4, 8, 12, 32),
-                             // High bit depth representation
-                             ::testing::Bool(),
                              // Bit depth
                              ::testing::Values(8, 10, 12)));
 
 class EdgeDetectImageTest :
-    // Parameters are (width, height, high bit depth representation, bit depth).
+    // Parameters are (width, height, bit depth).
-    public ::testing::TestWithParam<tuple<int, int, bool, int> > {
- protected:
-  // Skip the tests where bit depth is greater than 8, but high bit depth
-  // representation is not set (limitation of testing framework).
-  bool should_skip() const {
-    const bool high_bd = GET_PARAM(2);
-    const int bd = GET_PARAM(3);
-    return bd > 8 && !high_bd;
-  }
-};
+    public ::testing::TestWithParam<tuple<int, int, int> > {};
 
 // Generate images with black on one side and white on the other.
 TEST_P(EdgeDetectImageTest, BlackWhite) {
-  // Some combination of parameters are non-sensical, due to limitations
-  // of the testing framework. Ignore these.
-  if (should_skip()) {
-    return;
-  }
-
   const int width = GET_PARAM(0);
   const int height = GET_PARAM(1);
-  const bool high_bd = GET_PARAM(2);
-  const int bd = GET_PARAM(3);
+  const int bd = GET_PARAM(2);
 
   const int white = (1 << bd) - 1;
   std::unique_ptr<int[]> orig(new int[width * height]);
@@ -272,16 +215,16 @@
   }
 
   std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded(
-      pad_8tap_convolve(orig.get(), width, height, high_bd),
-      Pad8TapConvolveDeleter(width, high_bd));
+      pad_8tap_convolve(orig.get(), width, height),
+      Pad8TapConvolveDeleter(width));
   ASSERT_NE(padded, nullptr);
   // Value should be between 556 and 560.
-  ASSERT_LE(556, av1_edge_exists(padded.get(), stride_8tap(width), width,
-                                 height, high_bd, bd)
-                     .magnitude);
-  ASSERT_GE(560, av1_edge_exists(padded.get(), stride_8tap(width), width,
-                                 height, high_bd, bd)
-                     .magnitude);
+  ASSERT_LE(556,
+            av1_edge_exists(padded.get(), stride_8tap(width), width, height, bd)
+                .magnitude);
+  ASSERT_GE(560,
+            av1_edge_exists(padded.get(), stride_8tap(width), width, height, bd)
+                .magnitude);
 }
 
 // Hardcoded blur tests.
@@ -294,26 +237,21 @@
                                     147, 149, 145, 142, 143, 138, 126, 118,
                                     164, 156, 148, 144, 148, 148, 138, 126 };
 
-static void hardcoded_blur_test_aux(const bool high_bd) {
+static void hardcoded_blur_test_aux(void) {
   const int w = 8;
   const int h = 4;
   for (int bd = 8; bd <= 12; bd += 2) {
-    // Skip the tests where bit depth is greater than 8, but high bit depth
-    // representation is not set.
+    // Check the blur output against the hardcoded expected values at 8, 10
+    // and 12 bit depths.
-    if (bd > 8 && !high_bd) {
-      break;
-    }
-    std::unique_ptr<uint8_t[], MallocBdDeleter> output(
-        malloc_bd(w * h, high_bd), MallocBdDeleter(high_bd));
+    std::unique_ptr<uint8_t[], MallocBdDeleter> output(malloc_bd(w * h),
+                                                       MallocBdDeleter());
     ASSERT_NE(output, nullptr);
     std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded(
-        pad_8tap_convolve(luma, w, h, high_bd),
-        Pad8TapConvolveDeleter(w, high_bd));
+        pad_8tap_convolve(luma, w, h), Pad8TapConvolveDeleter(w));
     ASSERT_NE(padded, nullptr);
-    av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), high_bd,
-                      bd);
+    av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), bd);
     for (int i = 0; i < w * h; ++i) {
-      ASSERT_EQ(expected[i], get_pix(output.get(), i, high_bd));
+      ASSERT_EQ(expected[i], get_pix(output.get(), i));
     }
 
     // If we multiply the inputs by a constant factor, the output should not
@@ -323,42 +261,29 @@
       for (int i = 0; i < 32; ++i) {
         scaled_luma[i] = luma[i] * c;
       }
-      padded.reset(pad_8tap_convolve(scaled_luma, w, h, high_bd));
+      padded.reset(pad_8tap_convolve(scaled_luma, w, h));
       ASSERT_NE(padded, nullptr);
-      av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(),
-                        high_bd, bd);
+      av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), bd);
       for (int i = 0; i < w * h; ++i) {
-        ASSERT_GE(c / 2,
-                  abs(expected[i] * c - get_pix(output.get(), i, high_bd)));
+        ASSERT_GE(c / 2, abs(expected[i] * c - get_pix(output.get(), i)));
       }
     }
   }
 }
 
-TEST(EdgeDetectImageTest, HardcodedBlurTest) {
-  hardcoded_blur_test_aux(false);
-  hardcoded_blur_test_aux(true);
-}
+TEST(EdgeDetectImageTest, HardcodedBlurTest) { hardcoded_blur_test_aux(); }
 
 TEST(EdgeDetectImageTest, SobelTest) {
   // Randomly generated 3x3. Compute Sobel for middle value.
-  const uint8_t buf[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 };
-  const int stride = 3;
-  bool high_bd = false;
-  sobel_xy result = av1_sobel(buf, stride, 1, 1, high_bd);
-  ASSERT_EQ(234, result.x);
-  ASSERT_EQ(140, result.y);
-
-  // Verify it works for 8-bit values in a high bit-depth buffer.
   const uint16_t buf8_16[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 };
-  high_bd = true;
-  result = av1_sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1, high_bd);
+  const int stride = 3;
+  sobel_xy result = av1_sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1);
   ASSERT_EQ(234, result.x);
   ASSERT_EQ(140, result.y);
 
   // Verify it works for high bit-depth values as well.
   const uint16_t buf16[9] = { 241, 147, 7, 90, 184, 2003, 1028, 186, 2 };
-  result = av1_sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1, high_bd);
+  result = av1_sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1);
   ASSERT_EQ(-2566, result.x);
   ASSERT_EQ(-860, result.y);
 }
@@ -369,8 +294,6 @@
                              ::testing::Values(8, 16, 32),
                              // Height
                              ::testing::Values(4, 8, 12, 32),
-                             // High bit depth representation
-                             ::testing::Bool(),
                              // Bit depth
                              ::testing::Values(8, 10, 12)));
 }  // namespace
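
With the 8-bit branches removed, every buffer in these edge-detect tests goes through the same 16-bit pointer convention used elsewhere in the tree: a uint16_t allocation is tagged into a uint8_t * handle with CONVERT_TO_BYTEPTR and read back through CONVERT_TO_SHORTPTR. A minimal sketch of that round trip, assuming the macros come from aom_ports/mem.h and using illustrative helper names:

    #include "aom_mem/aom_mem.h"
    #include "aom_ports/mem.h"

    // Allocate num_samples 16-bit samples, returning the tagged uint8_t *
    // handle that the edge-detect helpers pass around.
    static uint8_t *alloc_hbd_buffer(int num_samples) {
      uint16_t *samples =
          (uint16_t *)aom_memalign(32, sizeof(uint16_t) * num_samples);
      if (samples == nullptr) return nullptr;
      return CONVERT_TO_BYTEPTR(samples);
    }

    // Reads and frees always convert the handle back to the real pointer.
    static int read_sample(uint8_t *buf, int i) {
      return CONVERT_TO_SHORTPTR(buf)[i];
    }

    static void free_hbd_buffer(uint8_t *buf) {
      aom_free(CONVERT_TO_SHORTPTR(buf));
    }
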
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 55b7ea3..2d191cd 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -88,16 +88,18 @@
   passes_ = 1;
 }
 
 static bool compare_plane(const uint8_t *const buf1, int stride1,
                           const uint8_t *const buf2, int stride2, int w, int h,
                           int *const mismatch_row, int *const mismatch_col,
                           int *const mismatch_pix1, int *const mismatch_pix2) {
   int r, c;
 
   for (r = 0; r < h; ++r) {
+    const uint16_t *row1 = (const uint16_t *)(buf1 + r * stride1);
+    const uint16_t *row2 = (const uint16_t *)(buf2 + r * stride2);
     for (c = 0; c < w; ++c) {
-      const int pix1 = buf1[r * stride1 + c];
-      const int pix2 = buf2[r * stride2 + c];
+      const int pix1 = row1[c];
+      const int pix2 = row2[c];
 
       if (pix1 != pix2) {
         if (mismatch_row != NULL) *mismatch_row = r;
@@ -163,7 +165,6 @@
 
 void EncoderTest::RunLoop(VideoSource *video) {
   aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
-  dec_cfg.allow_lowbitdepth = 1;
 
   ASSERT_EQ(1, (int)passes_);
   for (unsigned int pass = 0; pass < passes_; pass++) {
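
Because the raw plane data in this harness is now always stored as 16-bit samples while strides stay in bytes, each row pointer has to be reinterpreted before indexing, as compare_plane does above. A standalone sketch of the same pattern (helper name and return convention are illustrative):

    #include <stdint.h>

    // Returns 1 if two planes of 16-bit samples match; stride1 and stride2 are
    // byte strides, w and h are in samples.
    static int planes_equal(const uint8_t *buf1, int stride1,
                            const uint8_t *buf2, int stride2, int w, int h) {
      for (int r = 0; r < h; ++r) {
        const uint16_t *row1 = (const uint16_t *)(buf1 + r * stride1);
        const uint16_t *row2 = (const uint16_t *)(buf2 + r * stride2);
        for (int c = 0; c < w; ++c) {
          if (row1[c] != row2[c]) return 0;
        }
      }
      return 1;
    }
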
diff --git a/test/end_to_end_test.cc b/test/end_to_end_test.cc
index a79d09f..6a06224 100644
--- a/test/end_to_end_test.cc
+++ b/test/end_to_end_test.cc
@@ -149,7 +149,6 @@
     cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
     cfg_.g_bit_depth = test_video_param_.bit_depth;
     init_flags_ = AOM_CODEC_USE_PSNR;
-    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
 
     std::unique_ptr<libaom_test::VideoSource> video;
     if (is_extension_y4m(test_video_param_.filename)) {
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index 03b89a4..9ca0477 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -37,21 +37,9 @@
                                   const tran_low_t *dqcoeff,
                                   intptr_t block_size, int64_t *ssz, int bps);
 
-typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
-                                       const tran_low_t *dqcoeff,
-                                       intptr_t block_size, int64_t *ssz);
-
 typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
     ErrorBlockParam;
 
-template <ErrorBlockFunc8Bits fn>
-int64_t BlockError8BitWrapper(const tran_low_t *coeff,
-                              const tran_low_t *dqcoeff, intptr_t block_size,
-                              int64_t *ssz, int bps) {
-  EXPECT_EQ(bps, 8);
-  return fn(coeff, dqcoeff, block_size, ssz);
-}
-
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
  public:
   virtual ~ErrorBlockTest() {}
@@ -245,8 +233,6 @@
              AOM_BITS_12),
   make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
              AOM_BITS_8),
-  make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
@@ -261,27 +247,10 @@
              AOM_BITS_12),
   make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
              AOM_BITS_8),
-  make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
                          ::testing::ValuesIn(kErrorBlockTestParamsAvx2));
 #endif  // HAVE_AVX2
 
-#if (HAVE_MSA)
-INSTANTIATE_TEST_SUITE_P(
-    MSA, ErrorBlockTest,
-    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_msa>,
-                                 &BlockError8BitWrapper<av1_block_error_c>,
-                                 AOM_BITS_8)));
-#endif  // HAVE_MSA
-
-#if (HAVE_NEON)
-INSTANTIATE_TEST_SUITE_P(
-    NEON, ErrorBlockTest,
-    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
-                                 &BlockError8BitWrapper<av1_block_error_c>,
-                                 AOM_BITS_8)));
-#endif  // HAVE_NEON
 }  // namespace
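
With the 8-bit wrapper removed, every remaining entry in these parameter lists exercises the high bit-depth signature, which takes the bit depth as its final argument and scales the error back down so results are comparable across depths. A rough reference sketch with that signature; the scaling shown here is illustrative and may not match the exact rounding of libaom's av1_highbd_block_error_c:

    #include <stdint.h>

    typedef int32_t tran_low_t;  // stand-in for the library's coefficient type

    static int64_t block_error_ref(const tran_low_t *coeff,
                                   const tran_low_t *dqcoeff,
                                   intptr_t block_size, int64_t *ssz, int bps) {
      const int shift = 2 * (bps - 8);
      const int64_t rounding = shift > 0 ? (int64_t)1 << (shift - 1) : 0;
      int64_t error = 0, sqcoeff = 0;
      for (intptr_t i = 0; i < block_size; ++i) {
        const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i] * coeff[i];
      }
      *ssz = (sqcoeff + rounding) >> shift;
      return (error + rounding) >> shift;
    }
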
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index c9c7105..c00b7f5 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -211,7 +211,6 @@
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.w = 1280;
     cfg.h = 720;
-    cfg.allow_lowbitdepth = 1;
     decoder_ = codec_->CreateDecoder(cfg, 0);
     if (decoder_->IsAV1()) {
       decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 575a6f2..6f9029f 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -312,7 +312,6 @@
     video_->Begin();
 
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
-    cfg.allow_lowbitdepth = 0;
     decoder_ = new libaom_test::AV1Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }
@@ -380,7 +379,6 @@
     video_->Begin();
 
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
-    cfg.allow_lowbitdepth = 0;
     decoder_ = new libaom_test::AV1Decoder(cfg, 0);
     ASSERT_TRUE(decoder_ != NULL);
   }
@@ -429,7 +427,6 @@
   OpenMD5File(md5_filename);
 
   // Set decode config.
-  cfg.allow_lowbitdepth = 0;
   set_cfg(cfg);
 
   // Decode frame, and check the md5 matching.
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
deleted file mode 100644
index f8e521b..0000000
--- a/test/frame_error_test.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-namespace {
-typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
-                                    const uint8_t *const dst, int p_width,
-                                    int p_height, int p_stride);
-#if HAVE_AVX2 || HAVE_SSE2
-const int kBlockWidth[] = {
-  832, 834, 640, 1280, 1920,
-};
-const int kBlockHeight[] = {
-  480, 482, 360, 720, 1080,
-};
-#endif
-typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
-
-class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
- public:
-  virtual ~AV1FrameErrorTest() {}
-  virtual void SetUp() {
-    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RandomValues(frame_error_func test_impl, int width, int height);
-  void ExtremeValues(frame_error_func test_impl, int width, int height);
-  void RunSpeedTest(frame_error_func test_impl, int width, int height);
-  libaom_test::ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FrameErrorTest);
-
-void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
-                                     int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_TRUE(dst != NULL);
-  ASSERT_TRUE(ref != NULL);
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = rnd_.Rand8();
-    ref[i] = rnd_.Rand8();
-  }
-  const int64_t ref_error =
-      av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
-  const int64_t test_error = test_impl(ref, stride, dst, width, height, stride);
-  ASSERT_EQ(test_error, ref_error) << width << "x" << height;
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
-                                      int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_TRUE(dst != NULL);
-  ASSERT_TRUE(ref != NULL);
-  for (int r = 0; r < 2; r++) {
-    if (r == 0) {
-      memset(dst, 0, max_blk_size);
-      memset(ref, 255, max_blk_size);
-    } else if (r == 1) {
-      memset(dst, 255, max_blk_size);
-      memset(ref, 0, max_blk_size);
-    }
-    const int64_t ref_error =
-        av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
-    const int64_t test_error =
-        test_impl(ref, stride, dst, width, height, stride);
-    ASSERT_EQ(test_error, ref_error) << width << "x" << height;
-  }
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
-                                     int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_TRUE(dst != NULL);
-  ASSERT_TRUE(ref != NULL);
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = ref[i] = rnd_.Rand8();
-  }
-  const int num_loops = 10000000 / (width + height);
-  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
-  double elapsed_time[2] = { 0 };
-  for (int i = 0; i < 2; ++i) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    frame_error_func func = funcs[i];
-    for (int j = 0; j < num_loops; ++j) {
-      func(ref, stride, dst, width, height, stride);
-    }
-    aom_usec_timer_mark(&timer);
-    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    elapsed_time[i] = 1000.0 * time / num_loops;
-  }
-  aom_free(dst);
-  aom_free(ref);
-  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
-         elapsed_time[0], elapsed_time[1]);
-  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1FrameErrorTest, CheckOutput) {
-  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-}  // namespace
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
deleted file mode 100644
index 7903259..0000000
--- a/test/hadamard_test.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <algorithm>
-#include <ostream>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-
-namespace {
-
-using libaom_test::ACMRandom;
-
-typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
-                             tran_low_t *b);
-
-void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
-  tran_low_t b[8];
-  for (int i = 0; i < 8; i += 2) {
-    b[i + 0] = a[i * 8] + a[(i + 1) * 8];
-    b[i + 1] = a[i * 8] - a[(i + 1) * 8];
-  }
-  tran_low_t c[8];
-  for (int i = 0; i < 8; i += 4) {
-    c[i + 0] = b[i + 0] + b[i + 2];
-    c[i + 1] = b[i + 1] + b[i + 3];
-    c[i + 2] = b[i + 0] - b[i + 2];
-    c[i + 3] = b[i + 1] - b[i + 3];
-  }
-  out[0] = c[0] + c[4];
-  out[7] = c[1] + c[5];
-  out[3] = c[2] + c[6];
-  out[4] = c[3] + c[7];
-  out[2] = c[0] - c[4];
-  out[6] = c[1] - c[5];
-  out[1] = c[2] - c[6];
-  out[5] = c[3] - c[7];
-}
-
-void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
-  tran_low_t input[64];
-  tran_low_t buf[64];
-  for (int i = 0; i < 8; ++i) {
-    for (int j = 0; j < 8; ++j) {
-      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
-    }
-  }
-  for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
-  for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
-}
-
-void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
-  /* The source is a 16x16 block. The destination is rearranged to 8x32.
-   * Input is 9 bit. */
-  ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
-  ReferenceHadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
-  ReferenceHadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
-  ReferenceHadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
-
-  /* Overlay the 8x8 blocks and combine. */
-  for (int i = 0; i < 64; ++i) {
-    /* 8x8 steps the range up to 15 bits. */
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[64];
-    const tran_low_t a2 = b[128];
-    const tran_low_t a3 = b[192];
-
-    /* Prevent the result from escaping int16_t. */
-    const tran_low_t b0 = (a0 + a1) >> 1;
-    const tran_low_t b1 = (a0 - a1) >> 1;
-    const tran_low_t b2 = (a2 + a3) >> 1;
-    const tran_low_t b3 = (a2 - a3) >> 1;
-
-    /* Store a 16 bit value. */
-    b[0] = b0 + b2;
-    b[64] = b1 + b3;
-    b[128] = b0 - b2;
-    b[192] = b1 - b3;
-
-    ++b;
-  }
-}
-
-void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
-  ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
-  ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
-  ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
-  ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
-
-  for (int i = 0; i < 256; ++i) {
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[256];
-    const tran_low_t a2 = b[512];
-    const tran_low_t a3 = b[768];
-
-    const tran_low_t b0 = (a0 + a1) >> 2;
-    const tran_low_t b1 = (a0 - a1) >> 2;
-    const tran_low_t b2 = (a2 + a3) >> 2;
-    const tran_low_t b3 = (a2 - a3) >> 2;
-
-    b[0] = b0 + b2;
-    b[256] = b1 + b3;
-    b[512] = b0 - b2;
-    b[768] = b1 - b3;
-
-    ++b;
-  }
-}
-
-struct HadamardFuncWithSize {
-  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
-  HadamardFunc func;
-  int block_size;
-};
-
-std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
-  return os << "block size: " << hfs.block_size;
-}
-
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
- public:
-  virtual void SetUp() {
-    h_func_ = GetParam().func;
-    bwh_ = GetParam().block_size;
-    block_size_ = bwh_ * bwh_;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
-  virtual int16_t Rand() = 0;
-
-  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
-                         int bwh) {
-    if (bwh == 32)
-      ReferenceHadamard32x32(a, a_stride, b);
-    else if (bwh == 16)
-      ReferenceHadamard16x16(a, a_stride, b);
-    else
-      ReferenceHadamard8x8(a, a_stride, b);
-  }
-
-  void CompareReferenceRandom() {
-    const int kMaxBlockSize = 32 * 32;
-    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
-    memset(a, 0, sizeof(a));
-    memset(b, 0, sizeof(b));
-
-    tran_low_t b_ref[kMaxBlockSize];
-    memset(b_ref, 0, sizeof(b_ref));
-
-    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
-
-    ReferenceHadamard(a, bwh_, b_ref, bwh_);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + block_size_);
-    std::sort(b_ref, b_ref + block_size_);
-    EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
-  }
-
-  void VaryStride() {
-    const int kMaxBlockSize = 32 * 32;
-    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
-    memset(a, 0, sizeof(a));
-    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
-
-    tran_low_t b_ref[kMaxBlockSize];
-    for (int i = 8; i < 64; i += 8) {
-      memset(b, 0, sizeof(b));
-      memset(b_ref, 0, sizeof(b_ref));
-
-      ReferenceHadamard(a, i, b_ref, bwh_);
-      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-      // The order of the output is not important. Sort before checking.
-      std::sort(b, b + block_size_);
-      std::sort(b_ref, b_ref + block_size_);
-      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-    }
-  }
-
-  void SpeedTest(int times) {
-    const int kMaxBlockSize = 32 * 32;
-    DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
-    memset(input, 1, sizeof(input));
-    memset(output, 0, sizeof(output));
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < times; ++i) {
-      h_func_(input, bwh_, output);
-    }
-    aom_usec_timer_mark(&timer);
-
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
-           elapsed_time);
-  }
-
-  ACMRandom rnd_;
-
- private:
-  int bwh_;
-  int block_size_;
-  HadamardFunc h_func_;
-};
-
-class HadamardLowbdTest : public HadamardTestBase {
- public:
-  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
-};
-
-TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
-
-TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
-
-INSTANTIATE_TEST_SUITE_P(
-    C, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
-                      HadamardFuncWithSize(&aom_hadamard_16x16_c, 16),
-                      HadamardFuncWithSize(&aom_hadamard_32x32_c, 32)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8),
-                      HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16),
-                      HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32)));
-#endif  // HAVE_SSE2
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16),
-                      HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32)));
-#endif  // HAVE_AVX2
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8),
-                      HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
-#endif  // HAVE_NEON
-
-}  // namespace
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
deleted file mode 100644
index fa45654..0000000
--- a/test/hbd_metrics_test.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <new>
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/psnr.h"
-#include "aom_dsp/ssim.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/msvc.h"
-#include "aom_scale/yv12config.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-
-typedef double (*LBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
-                                const YV12_BUFFER_CONFIG *dest);
-typedef double (*HBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
-                                const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
-                                uint32_t bd);
-
-double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
-                        const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
-                        uint32_t bd) {
-  PSNR_STATS psnr;
-  aom_calc_highbd_psnr(source, dest, &psnr, bd, in_bd);
-  return psnr.psnr[0];
-}
-
-double compute_psnr(const YV12_BUFFER_CONFIG *source,
-                    const YV12_BUFFER_CONFIG *dest) {
-  PSNR_STATS psnr;
-  aom_calc_psnr(source, dest, &psnr);
-  return psnr.psnr[0];
-}
-
-double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
-                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
-                           uint32_t bd) {
-  double tempy, tempu, tempv;
-  return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
-}
-
-double compute_psnrhvs(const YV12_BUFFER_CONFIG *source,
-                       const YV12_BUFFER_CONFIG *dest) {
-  double tempy, tempu, tempv;
-  return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, 8, 8);
-}
-
-double compute_hbd_fastssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
-                            uint32_t bd) {
-  double tempy, tempu, tempv;
-  return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
-}
-
-double compute_fastssim(const YV12_BUFFER_CONFIG *source,
-                        const YV12_BUFFER_CONFIG *dest) {
-  double tempy, tempu, tempv;
-  return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, 8, 8);
-}
-
-double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
-                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
-                           uint32_t bd) {
-  double ssim, weight;
-  ssim = aom_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
-  return 100 * pow(ssim / weight, 8.0);
-}
-
-double compute_aomssim(const YV12_BUFFER_CONFIG *source,
-                       const YV12_BUFFER_CONFIG *dest) {
-  double ssim, weight;
-  ssim = aom_calc_ssim(source, dest, &weight);
-  return 100 * pow(ssim / weight, 8.0);
-}
-
-class HBDMetricsTestBase {
- public:
-  virtual ~HBDMetricsTestBase() {}
-
- protected:
-  void RunAccuracyCheck() {
-    const int width = 1920;
-    const int height = 1080;
-    size_t i = 0;
-    const uint8_t kPixFiller = 128;
-    YV12_BUFFER_CONFIG lbd_src, lbd_dst;
-    YV12_BUFFER_CONFIG hbd_src, hbd_dst;
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    double lbd_db, hbd_db;
-
-    memset(&lbd_src, 0, sizeof(lbd_src));
-    memset(&lbd_dst, 0, sizeof(lbd_dst));
-    memset(&hbd_src, 0, sizeof(hbd_src));
-    memset(&hbd_dst, 0, sizeof(hbd_dst));
-
-    aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16);
-    aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16);
-    aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16);
-    aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16);
-
-    memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
-    while (i < lbd_src.buffer_alloc_sz) {
-      uint16_t spel, dpel;
-      spel = lbd_src.buffer_alloc[i];
-      // Create some distortion for dst buffer.
-      dpel = rnd.Rand8();
-      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
-      ((uint16_t *)(hbd_src.buffer_alloc))[i] = spel << (bit_depth_ - 8);
-      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
-      i++;
-    }
-
-    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
-    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
-    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
-
-    i = 0;
-    while (i < lbd_src.buffer_alloc_sz) {
-      uint16_t dpel;
-      // Create some small distortion for dst buffer.
-      dpel = 120 + (rnd.Rand8() >> 4);
-      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
-      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
-      i++;
-    }
-
-    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
-    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
-    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
-
-    i = 0;
-    while (i < lbd_src.buffer_alloc_sz) {
-      uint16_t dpel;
-      // Create some small distortion for dst buffer.
-      dpel = 126 + (rnd.Rand8() >> 6);
-      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
-      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
-      i++;
-    }
-
-    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
-    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
-    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
-
-    aom_free_frame_buffer(&lbd_src);
-    aom_free_frame_buffer(&lbd_dst);
-    aom_free_frame_buffer(&hbd_src);
-    aom_free_frame_buffer(&hbd_dst);
-  }
-
-  int input_bit_depth_;
-  int bit_depth_;
-  double threshold_;
-  LBDMetricFunc lbd_metric_;
-  HBDMetricFunc hbd_metric_;
-};
-
-typedef std::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
-    MetricTestTParam;
-class HBDMetricsTest : public HBDMetricsTestBase,
-                       public ::testing::TestWithParam<MetricTestTParam> {
- public:
-  virtual void SetUp() {
-    lbd_metric_ = GET_PARAM(0);
-    hbd_metric_ = GET_PARAM(1);
-    input_bit_depth_ = GET_PARAM(2);
-    bit_depth_ = GET_PARAM(3);
-    threshold_ = GET_PARAM(4);
-  }
-  virtual void TearDown() {}
-};
-
-TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
-
-// Allow small variation due to floating point operations.
-static const double kSsim_thresh = 0.001;
-// Allow some additional errors accumulated in floating point operations.
-static const double kFSsim_thresh = 0.03;
-// Allow some extra variation due to rounding error accumulated in dct.
-static const double kPhvs_thresh = 0.3;
-
-INSTANTIATE_TEST_SUITE_P(
-    AOMSSIM, HBDMetricsTest,
-    ::testing::Values(MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
-                                       8, 10, kSsim_thresh),
-                      MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
-                                       10, 10, kPhvs_thresh),
-                      MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
-                                       8, 12, kSsim_thresh),
-                      MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
-                                       12, 12, kPhvs_thresh)));
-INSTANTIATE_TEST_SUITE_P(
-    FASTSSIM, HBDMetricsTest,
-    ::testing::Values(MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
-                                       8, 10, kFSsim_thresh),
-                      MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
-                                       10, 10, kFSsim_thresh),
-                      MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
-                                       8, 12, kFSsim_thresh),
-                      MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
-                                       12, 12, kFSsim_thresh)));
-INSTANTIATE_TEST_SUITE_P(
-    PSNRHVS, HBDMetricsTest,
-    ::testing::Values(MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
-                                       8, 10, kPhvs_thresh),
-                      MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
-                                       10, 10, kPhvs_thresh),
-                      MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
-                                       8, 12, kPhvs_thresh),
-                      MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
-                                       12, 12, kPhvs_thresh)));
-INSTANTIATE_TEST_SUITE_P(
-    PSNR, HBDMetricsTest,
-    ::testing::Values(
-        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10, kPhvs_thresh),
-        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10, 10,
-                         kPhvs_thresh),
-        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 12, kPhvs_thresh),
-        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12, 12,
-                         kPhvs_thresh)));
-}  // namespace
diff --git a/test/hiprec_convolve_test.cc b/test/hiprec_convolve_test.cc
index 267d315..e6ba2f0 100644
--- a/test/hiprec_convolve_test.cc
+++ b/test/hiprec_convolve_test.cc
@@ -18,33 +18,11 @@
 using libaom_test::ACMRandom;
 using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdHiprecConvolveTest);
-using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HiprecConvolveTest);
 using std::make_tuple;
 using std::tuple;
 
 namespace {
 
-TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
-TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) {
-  RunSpeedTest(GET_PARAM(3));
-}
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AV1HiprecConvolveTest,
-                         libaom_test::AV1HiprecConvolve::BuildParams(
-                             av1_wiener_convolve_add_src_sse2));
-#endif
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1HiprecConvolveTest,
-                         libaom_test::AV1HiprecConvolve::BuildParams(
-                             av1_wiener_convolve_add_src_avx2));
-#endif
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1HiprecConvolveTest,
-                         libaom_test::AV1HiprecConvolve::BuildParams(
-                             av1_wiener_convolve_add_src_neon));
-#endif
-
 #if HAVE_SSSE3 || HAVE_AVX2
 TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
   RunCheckOutput(GET_PARAM(4));
diff --git a/test/hiprec_convolve_test_util.cc b/test/hiprec_convolve_test_util.cc
index f25bdf0..20ffb16 100644
--- a/test/hiprec_convolve_test_util.cc
+++ b/test/hiprec_convolve_test_util.cc
@@ -65,142 +65,6 @@
   }
 }
 
-namespace AV1HiprecConvolve {
-
-::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
-    hiprec_convolve_func filter) {
-  const HiprecConvolveParam params[] = {
-    make_tuple(8, 8, 50000, filter),   make_tuple(8, 4, 50000, filter),
-    make_tuple(64, 24, 1000, filter),  make_tuple(64, 64, 1000, filter),
-    make_tuple(64, 56, 1000, filter),  make_tuple(32, 8, 10000, filter),
-    make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
-    make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
-    make_tuple(64, 34, 1000, filter),  make_tuple(8, 17, 10000, filter),
-    make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
-  };
-  return ::testing::ValuesIn(params);
-}
-
-AV1HiprecConvolveTest::~AV1HiprecConvolveTest() {}
-void AV1HiprecConvolveTest::SetUp() {
-  rnd_.Reset(ACMRandom::DeterministicSeed());
-}
-
-void AV1HiprecConvolveTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
-  const int w = 128, h = 128;
-  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  const int num_iters = GET_PARAM(2);
-  int i, j, k, m;
-  const ConvolveParams conv_params = get_conv_params_wiener(8);
-
-  uint8_t *input_ = new uint8_t[h * w];
-  uint8_t *input = input_;
-
-  // The AVX2 convolve functions always write rows with widths that are
-  // multiples of 16. So to avoid a buffer overflow, we may need to pad
-  // rows to a multiple of 16.
-  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
-  uint8_t *output = new uint8_t[output_n];
-  uint8_t *output2 = new uint8_t[output_n];
-
-  // Generate random filter kernels
-  DECLARE_ALIGNED(16, InterpKernel, hkernel);
-  DECLARE_ALIGNED(16, InterpKernel, vkernel);
-
-  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
-    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
-    for (i = 0; i < num_iters; ++i) {
-      for (k = 0; k < h; ++k)
-        for (m = 0; m < w; ++m) input[k * w + m] = rnd_.Rand8();
-      // Choose random locations within the source block
-      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
-      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
-      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
-                                    out_w, hkernel, 16, vkernel, 16, out_w,
-                                    out_h, &conv_params);
-      test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel, 16,
-                vkernel, 16, out_w, out_h, &conv_params);
-
-      for (j = 0; j < out_w * out_h; ++j)
-        ASSERT_EQ(output[j], output2[j])
-            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
-            << (j / out_w) << ") on iteration " << i;
-    }
-  }
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
-}
-
-void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
-  const int w = 128, h = 128;
-  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
-  const int num_iters = GET_PARAM(2) / 500;
-  int i, j, k;
-  const ConvolveParams conv_params = get_conv_params_wiener(8);
-
-  uint8_t *input_ = new uint8_t[h * w];
-  uint8_t *input = input_;
-
-  // The AVX2 convolve functions always write rows with widths that are
-  // multiples of 16. So to avoid a buffer overflow, we may need to pad
-  // rows to a multiple of 16.
-  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
-  uint8_t *output = new uint8_t[output_n];
-  uint8_t *output2 = new uint8_t[output_n];
-
-  // Generate random filter kernels
-  DECLARE_ALIGNED(16, InterpKernel, hkernel);
-  DECLARE_ALIGNED(16, InterpKernel, vkernel);
-
-  generate_kernels(&rnd_, hkernel, vkernel);
-
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
-
-  aom_usec_timer ref_timer;
-  aom_usec_timer_start(&ref_timer);
-  for (i = 0; i < num_iters; ++i) {
-    for (j = 3; j < h - out_h - 4; j++) {
-      for (k = 3; k < w - out_w - 4; k++) {
-        av1_wiener_convolve_add_src_c(input + j * w + k, w, output, out_w,
-                                      hkernel, 16, vkernel, 16, out_w, out_h,
-                                      &conv_params);
-      }
-    }
-  }
-  aom_usec_timer_mark(&ref_timer);
-  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
-
-  aom_usec_timer tst_timer;
-  aom_usec_timer_start(&tst_timer);
-  for (i = 0; i < num_iters; ++i) {
-    for (j = 3; j < h - out_h - 4; j++) {
-      for (k = 3; k < w - out_w - 4; k++) {
-        test_impl(input + j * w + k, w, output2, out_w, hkernel, 16, vkernel,
-                  16, out_w, out_h, &conv_params);
-      }
-    }
-  }
-  aom_usec_timer_mark(&tst_timer);
-  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
-
-  std::cout << "[          ] C time = " << ref_time / 1000
-            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
-
-  EXPECT_GT(ref_time, tst_time)
-      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
-      << "C time: " << ref_time << " us\n"
-      << "SIMD time: " << tst_time << " us\n";
-
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
-}
-}  // namespace AV1HiprecConvolve
-
 namespace AV1HighbdHiprecConvolve {
 
 ::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 488b326..5e3268a 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -112,7 +112,6 @@
     cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
     cfg_.g_bit_depth = test_video_param_.bit_depth;
     init_flags_ = AOM_CODEC_USE_PSNR;
-    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
 
     // Set superres parameters
     cfg_.rc_superres_mode = superres_mode_;
@@ -216,7 +215,6 @@
     cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
     cfg_.g_bit_depth = test_video_param_.bit_depth;
     init_flags_ = AOM_CODEC_USE_PSNR;
-    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
 
     // Set superres parameters
     cfg_.rc_superres_mode = superres_mode_;
@@ -327,7 +325,6 @@
     cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
     cfg_.g_bit_depth = test_video_param_.bit_depth;
     init_flags_ = AOM_CODEC_USE_PSNR;
-    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
 
     // Set superres parameters
     cfg_.rc_superres_mode = superres_mode_;
diff --git a/test/intra_edge_test.cc b/test/intra_edge_test.cc
index 6720db3..b938f6a 100644
--- a/test/intra_edge_test.cc
+++ b/test/intra_edge_test.cc
@@ -64,50 +64,6 @@
 };
 
 //////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*UP8B)(uint8_t *p, int size);
-typedef libaom_test::FuncParam<UP8B> TestFuncs;
-
-class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
- protected:
-  void Execute(uint8_t *edge_tst) {
-    params_.ref_func(edge_ref_, size_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
-  }
-};
-
-TEST_P(UpsampleTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    size_ = 4 * (this->rng_(4) + 1);
-
-    int i, pix = 0;
-    for (i = 0; i < kOffset + size_; ++i) {
-      pix = rng_.Rand8();
-      edge_ref_data_[i] = pix;
-      edge_tst_data_[i] = edge_ref_data_[i];
-    }
-
-    // Extend final sample
-    while (i < kBufSize) {
-      edge_ref_data_[i] = pix;
-      edge_tst_data_[i] = pix;
-      i++;
-    }
-
-    Common();
-  }
-}
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, UpsampleTest8B,
-    ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
-                                av1_upsample_intra_edge_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-//////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
 
@@ -193,44 +149,6 @@
 };
 
 //////////////////////////////////////////////////////////////////////////////
-// 8 bit version
-//////////////////////////////////////////////////////////////////////////////
-
-typedef void (*FE8B)(uint8_t *p, int size, int strength);
-typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
-
-class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
- protected:
-  void Execute(uint8_t *edge_tst) {
-    params_.ref_func(edge_ref_, size_, strength_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
-  }
-};
-
-TEST_P(FilterEdgeTest8B, RandomValues) {
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    strength_ = this->rng_(4);
-    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
-
-    int i, pix = 0;
-    for (i = 0; i < kOffset + size_; ++i) {
-      pix = rng_.Rand8();
-      edge_ref_data_[i] = pix;
-      edge_tst_data_[i] = pix;
-    }
-
-    Common();
-  }
-}
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, FilterEdgeTest8B,
-    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
-                                          av1_filter_intra_edge_sse4_1)));
-#endif  // HAVE_SSE4_1
-
-//////////////////////////////////////////////////////////////////////////////
 // High bit-depth version
 //////////////////////////////////////////////////////////////////////////////
 
@@ -277,18 +195,6 @@
 
 // Speed tests
 
-TEST_P(UpsampleTest8B, DISABLED_Speed) {
-  const int test_count = 10000000;
-  size_ = kMaxEdge;
-  for (int i = 0; i < kOffset + size_; ++i) {
-    edge_tst_data_[i] = rng_.Rand8();
-  }
-  edge_tst_ = &edge_tst_data_[kOffset];
-  for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
-  }
-}
-
 TEST_P(UpsampleTestHB, DISABLED_Speed) {
   const int test_count = 10000000;
   size_ = kMaxEdge;
@@ -303,21 +209,6 @@
   }
 }
 
-TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
-  const int test_count = 10000000;
-  size_ = kMaxEdge;
-  strength_ = 1;
-  for (int i = 0; i < kOffset + size_; ++i) {
-    edge_tst_data_[i] = rng_.Rand8();
-  }
-  edge_tst_ = &edge_tst_data_[kOffset];
-  for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
-    // iterate over filter strengths (1,2,3)
-    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
-  }
-}
-
 TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
   const int test_count = 10000000;
   size_ = kMaxEdge;
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 137a4a7..59bec65 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -35,9 +35,6 @@
 typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
                                 const uint16_t *above, const uint16_t *left,
                                 int bps);
-typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *left);
-
 }  // namespace
 
 // NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the
@@ -217,26 +214,6 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdIntraPredTest);
 
-class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
- protected:
-  void Predict() {
-    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
-    ASM_REGISTER_STATE_CHECK(
-        params_.pred_fn(dst_, stride_, above_row_, left_col_));
-  }
-  void PredictRefSpeedTest(int num) {
-    for (int i = 0; i < num; i++) {
-      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
-    }
-  }
-  void PredictFncSpeedTest(int num) {
-    for (int i = 0; i < num; i++) {
-      params_.pred_fn(dst_, stride_, above_row_, left_col_);
-    }
-  }
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(LowbdIntraPredTest);
-
 // Suppress an unitialized warning. Once there are implementations to test then
 // this can be restored.
 TEST_P(HighbdIntraPredTest, Bitexact) {
@@ -250,27 +227,6 @@
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
-TEST_P(LowbdIntraPredTest, Bitexact) {
-  // max block size is 64
-  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
-  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
-  av1_zero(left_col);
-  av1_zero(above_data);
-  RunTest(left_col, above_data, dst, ref_dst);
-}
-TEST_P(LowbdIntraPredTest, DISABLED_Speed) {
-  // max block size is 64
-  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
-  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
-  av1_zero(left_col);
-  av1_zero(above_data);
-  RunSpeedTest(left_col, above_data, dst, ref_dst);
-}
-
 // -----------------------------------------------------------------------------
 // High Bit Depth Tests
 #define highbd_entry(type, width, height, opt, bd)                          \
@@ -288,77 +244,6 @@
       highbd_entry(type, 16, 32, opt, bd),                                    \
       highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
 #endif
-// ---------------------------------------------------------------------------
-// Low Bit Depth Tests
-
-#define lowbd_entry(type, width, height, opt)                                  \
-  IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
-                           &aom_##type##_predictor_##width##x##height##_c,     \
-                           width, height, 8)
-
-#define lowbd_intrapred(type, opt)                                    \
-  lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt),         \
-      lowbd_entry(type, 8, 4, opt), lowbd_entry(type, 8, 8, opt),     \
-      lowbd_entry(type, 8, 16, opt), lowbd_entry(type, 16, 8, opt),   \
-      lowbd_entry(type, 16, 16, opt), lowbd_entry(type, 16, 32, opt), \
-      lowbd_entry(type, 32, 16, opt), lowbd_entry(type, 32, 32, opt)
-
-#if HAVE_SSE2
-const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
-  lowbd_intrapred(dc, sse2),      lowbd_intrapred(dc_top, sse2),
-  lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
-  lowbd_intrapred(v, sse2),       lowbd_intrapred(h, sse2),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, LowbdIntraPredTest,
-                         ::testing::ValuesIn(LowbdIntraPredTestVector));
-
-#endif  // HAVE_SSE2
-
-#if HAVE_NEON
-const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
-  lowbd_entry(smooth, 4, 4, neon),   lowbd_entry(smooth, 4, 8, neon),
-  lowbd_entry(smooth, 4, 16, neon),  lowbd_entry(smooth, 8, 4, neon),
-  lowbd_entry(smooth, 8, 8, neon),   lowbd_entry(smooth, 8, 16, neon),
-  lowbd_entry(smooth, 8, 32, neon),  lowbd_entry(smooth, 16, 4, neon),
-  lowbd_entry(smooth, 16, 8, neon),  lowbd_entry(smooth, 16, 16, neon),
-  lowbd_entry(smooth, 16, 32, neon), lowbd_entry(smooth, 16, 64, neon),
-  lowbd_entry(smooth, 32, 8, neon),  lowbd_entry(smooth, 32, 16, neon),
-  lowbd_entry(smooth, 32, 32, neon), lowbd_entry(smooth, 32, 64, neon),
-  lowbd_entry(smooth, 64, 16, neon), lowbd_entry(smooth, 64, 32, neon),
-  lowbd_entry(smooth, 64, 64, neon)
-};
-INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
-                         ::testing::ValuesIn(LowbdIntraPredTestVectorNeon));
-#endif  // HAVE_NEON
-
-#if HAVE_SSSE3
-const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
-  lowbd_intrapred(paeth, ssse3),
-  lowbd_intrapred(smooth, ssse3),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, LowbdIntraPredTest,
-                         ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
-
-#endif  // HAVE_SSSE3
-
-#if HAVE_AVX2
-const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
-  lowbd_entry(dc, 32, 32, avx2),      lowbd_entry(dc_top, 32, 32, avx2),
-  lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2),
-  lowbd_entry(v, 32, 32, avx2),       lowbd_entry(h, 32, 32, avx2),
-  lowbd_entry(dc, 32, 16, avx2),      lowbd_entry(dc_top, 32, 16, avx2),
-  lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
-  lowbd_entry(v, 32, 16, avx2),       lowbd_entry(paeth, 16, 8, avx2),
-  lowbd_entry(paeth, 16, 16, avx2),   lowbd_entry(paeth, 16, 32, avx2),
-  lowbd_entry(paeth, 32, 16, avx2),   lowbd_entry(paeth, 32, 32, avx2),
-};
-
-INSTANTIATE_TEST_SUITE_P(AVX2, LowbdIntraPredTest,
-                         ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
-
-#endif  // HAVE_AVX2
 
 #if HAVE_NEON
 const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index a553fcb..2ae87aa 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -102,7 +102,7 @@
 
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
-    aom_codec_dec_cfg_t cfg = { 0, 0, 0, 0 };
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0 };
     cfg.threads = input.threads;
     const std::string filename = input.filename;
     libaom_test::IVFVideoSource decode_video(filename);
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 6a65bff..4d75c3d 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -156,24 +156,11 @@
   op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
 }
 
-void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
-  (void)bd;
-  op(s, p, blimit, limit, thresh);
-}
-void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
-  (void)bd;
-  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
-};
-
 typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_hbd);
 typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
     Loop8Test9Param_hbd;
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_hbd);
-typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_lbd);
-typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_lbd);
 
 #define OPCHECK(a, b)                                                          \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
@@ -216,7 +203,6 @@
       << "First failed at test case " << first_failure;
 
 TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
-TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); }
 
 #define VALCHECK(a, b)                                                         \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
@@ -262,7 +248,6 @@
       << "First failed at test case " << first_failure;
 
 TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
-TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); }
 
 #define SPEEDCHECK(a, b)                                                      \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                              \
@@ -290,7 +275,6 @@
   }
 
 TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
-TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); }
 
 #define OPCHECKd(a, b)                                                         \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
@@ -347,7 +331,6 @@
       << "First failed at test case " << first_failure;
 
 TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
-TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
 
 #define VALCHECKd(a, b)                                                        \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
@@ -406,7 +389,6 @@
       << "First failed at test case " << first_failure;
 
 TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
-TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
 
 #define SPEEDCHECKd(a, b)                                                    \
   ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
@@ -446,7 +428,6 @@
   }
 
 TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
-TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); }
 
 using std::make_tuple;
 
@@ -497,35 +478,6 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd,
                          ::testing::ValuesIn(kHbdLoop8Test6));
 
-const loop_param_t kLoop8Test6[] = {
-  make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
-  make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
-  make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
-  make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
-  make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8),
-  make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
-  make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
-  make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_lbd,
-                         ::testing::ValuesIn(kLoop8Test6));
-
-const dual_loop_param_t kLoop8Test9[] = {
-  make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
-  make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8),
-  make_tuple(&aom_lpf_horizontal_6_dual_sse2, &aom_lpf_horizontal_6_dual_c, 8),
-  make_tuple(&aom_lpf_vertical_6_dual_sse2, &aom_lpf_vertical_6_dual_c, 8),
-  make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
-  make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8),
-  make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
-             8),
-  make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_lbd,
-                         ::testing::ValuesIn(kLoop8Test9));
-
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE2
@@ -585,22 +537,6 @@
 
 #endif  // HAVE_SSE2
 
-#if HAVE_NEON
-const loop_param_t kLoop8Test6[] = {
-  make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),
-  make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
-  make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8),
-  make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8),
-  make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
-  make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
-  make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8),
-  make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8)
-};
-
-INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_lbd,
-                         ::testing::ValuesIn(kLoop8Test6));
-#endif  // HAVE_NEON
-
 #if HAVE_AVX2
 const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {
   make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index 55953a7..4039046 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -30,213 +30,6 @@
 namespace {
 const int number_of_iterations = 200;
 
-typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride,
-                                      const uint8_t *second_pred,
-                                      const uint8_t *msk, int msk_stride,
-                                      int invert_mask);
-typedef std::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
-
-typedef void (*MaskedSADx4Func)(const uint8_t *src, int src_stride,
-                                const uint8_t *ref[], int ref_stride,
-                                const uint8_t *second_pred, const uint8_t *msk,
-                                int msk_stride, int invert_mask,
-                                unsigned sads[]);
-
-typedef std::tuple<MaskedSADx4Func, MaskedSADx4Func> MaskedSADx4Param;
-
-class MaskedSADTestBase : public ::testing::Test {
- public:
-  virtual ~MaskedSADTestBase() {}
-  virtual void SetUp() = 0;
-  virtual void runRef(const uint8_t *src_ptr, int src_stride,
-                      const uint8_t *ref_ptr[], int ref_stride,
-                      const uint8_t *second_pred, const uint8_t *msk,
-                      int msk_stride, int inv_mask, unsigned sads[],
-                      int times) = 0;
-  virtual void runTest(const uint8_t *src_ptr, int src_stride,
-                       const uint8_t *ref_ptr[], int ref_stride,
-                       const uint8_t *second_pred, const uint8_t *msk,
-                       int msk_stride, int inv_mask, unsigned sads[],
-                       int times) = 0;
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-  void runMaskedSADTest(int run_times);
-};
-
-class MaskedSADTest : public MaskedSADTestBase,
-                      public ::testing::WithParamInterface<MaskedSADParam> {
- public:
-  virtual ~MaskedSADTest() {}
-  virtual void SetUp() {
-    maskedSAD_op_ = GET_PARAM(0);
-    ref_maskedSAD_op_ = GET_PARAM(1);
-  }
-
-  virtual void runRef(const uint8_t *src_ptr, int src_stride,
-                      const uint8_t *ref_ptr[], int ref_stride,
-                      const uint8_t *second_pred, const uint8_t *msk,
-                      int msk_stride, int inv_mask, unsigned sads[], int times);
-  virtual void runTest(const uint8_t *src_ptr, int src_stride,
-                       const uint8_t *ref_ptr[], int ref_stride,
-                       const uint8_t *second_pred, const uint8_t *msk,
-                       int msk_stride, int inv_mask, unsigned sads[],
-                       int times);
-
- protected:
-  MaskedSADFunc maskedSAD_op_;
-  MaskedSADFunc ref_maskedSAD_op_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADTest);
-
-class MaskedSADx4Test : public MaskedSADTestBase,
-                        public ::testing::WithParamInterface<MaskedSADx4Param> {
- public:
-  virtual ~MaskedSADx4Test() {}
-  virtual void SetUp() {
-    maskedSAD_op_ = GET_PARAM(0);
-    ref_maskedSAD_op_ = GET_PARAM(1);
-  }
-  virtual void runRef(const uint8_t *src_ptr, int src_stride,
-                      const uint8_t *ref_ptr[], int ref_stride,
-                      const uint8_t *second_pred, const uint8_t *msk,
-                      int msk_stride, int inv_mask, unsigned sads[], int times);
-  virtual void runTest(const uint8_t *src_ptr, int src_stride,
-                       const uint8_t *ref_ptr[], int ref_stride,
-                       const uint8_t *second_pred, const uint8_t *msk,
-                       int msk_stride, int inv_mask, unsigned sads[],
-                       int times);
-
- protected:
-  MaskedSADx4Func maskedSAD_op_;
-  MaskedSADx4Func ref_maskedSAD_op_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADx4Test);
-
-void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride,
-                           const uint8_t *ref_ptr[], int ref_stride,
-                           const uint8_t *second_pred, const uint8_t *msk,
-                           int msk_stride, int invert_mask, unsigned sads[],
-                           int times) {
-  for (int repeat = 0; repeat < times; ++repeat) {
-    sads[0] = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
-                                second_pred, msk, msk_stride, invert_mask);
-  }
-}
-
-void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *ref_ptr[], int ref_stride,
-                            const uint8_t *second_pred, const uint8_t *msk,
-                            int msk_stride, int invert_mask, unsigned sads[],
-                            int times) {
-  if (times == 1) {
-    sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
-                            second_pred, msk, msk_stride, invert_mask);
-  } else {
-    for (int repeat = 0; repeat < times; ++repeat) {
-      ASM_REGISTER_STATE_CHECK(
-          sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
-                                  second_pred, msk, msk_stride, invert_mask));
-    }
-  }
-}
-
-void MaskedSADx4Test::runRef(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr[], int ref_stride,
-                             const uint8_t *second_pred, const uint8_t *msk,
-                             int msk_stride, int invert_mask, unsigned sads[],
-                             int times) {
-  for (int repeat = 0; repeat < times; ++repeat) {
-    ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred,
-                      msk, msk_stride, invert_mask, sads);
-  }
-}
-
-void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride,
-                              const uint8_t *ref_ptr[], int ref_stride,
-                              const uint8_t *second_pred, const uint8_t *msk,
-                              int msk_stride, int invert_mask, unsigned sads[],
-                              int times) {
-  if (times == 1) {
-    ASM_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
-                                           ref_stride, second_pred, msk,
-                                           msk_stride, invert_mask, sads));
-  } else {
-    for (int repeat = 0; repeat < times; ++repeat) {
-      maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, msk,
-                    msk_stride, invert_mask, sads);
-    }
-  }
-}
-
-void MaskedSADTestBase::runMaskedSADTest(int run_times) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const unsigned kBlockSize = MAX_SB_SIZE * MAX_SB_SIZE;
-  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE * 4]);
-  DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
-
-  const uint8_t *refs[] = { ref_ptr, ref_ptr + kBlockSize,
-                            ref_ptr + 2 * kBlockSize,
-                            ref_ptr + 3 * kBlockSize };
-  unsigned sads[] = { 0, 0, 0, 0 };
-  unsigned sads_ref[] = { 0, 0, 0, 0 };
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = MAX_SB_SIZE;
-  int ref_stride = MAX_SB_SIZE;
-  int msk_stride = MAX_SB_SIZE;
-  const int iters = run_times == 1 ? number_of_iterations : 1;
-  for (int i = 0; i < iters; ++i) {
-    for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
-      src_ptr[j] = rnd.Rand8();
-      ref_ptr[j] = rnd.Rand8();
-      (ref_ptr + kBlockSize)[j] = rnd.Rand8();
-      (ref_ptr + 2 * kBlockSize)[j] = rnd.Rand8();
-      (ref_ptr + 3 * kBlockSize)[j] = rnd.Rand8();
-      second_pred_ptr[j] = rnd.Rand8();
-      msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
-      assert(msk_ptr[j] <= 64);
-    }
-
-    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
-      aom_usec_timer timer;
-      aom_usec_timer_start(&timer);
-      runRef(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr,
-             msk_stride, invert_mask, sads_ref, run_times);
-      aom_usec_timer_mark(&timer);
-      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-
-      aom_usec_timer_start(&timer);
-      runTest(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr,
-              msk_stride, invert_mask, sads, run_times);
-      aom_usec_timer_mark(&timer);
-      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-
-      if (run_times > 10) {
-        printf("%7.2f/%7.2fns", time1, time2);
-        printf("(%3.2f)\n", time1 / time2);
-      }
-      if (sads_ref[0] != sads[0] || sads_ref[1] != sads[1] ||
-          sads_ref[2] != sads[2] || sads_ref[3] != sads[3]) {
-        err_count++;
-        if (first_failure == -1) first_failure = i;
-      }
-    }
-  }
-  EXPECT_EQ(0, err_count) << "Error: Masked SAD Test,  output doesn't match. "
-                          << "First failed at test case " << first_failure;
-}
-
-TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); }
-
-TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); }
-
-TEST_P(MaskedSADx4Test, OperationCheck) { runMaskedSADTest(1); }
-
-TEST_P(MaskedSADx4Test, DISABLED_Speed) { runMaskedSADTest(2000000); }
-
 typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             const uint8_t *second_pred,
@@ -334,61 +127,6 @@
 using std::make_tuple;
 
 #if HAVE_SSSE3
-const MaskedSADParam msad_test[] = {
-  make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c),
-  make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
-  make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
-  make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
-  make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
-  make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
-  make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
-  make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
-  make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
-  make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
-  make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
-  make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
-  make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
-  make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
-  make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
-  make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
-  make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
-  make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
-  make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
-  make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
-  make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
-  make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
-
-const MaskedSADx4Param msadx4_test[] = {
-  make_tuple(&aom_masked_sad4x4x4d_ssse3, &aom_masked_sad4x4x4d_c),
-  make_tuple(&aom_masked_sad4x8x4d_ssse3, &aom_masked_sad4x8x4d_c),
-  make_tuple(&aom_masked_sad8x4x4d_ssse3, &aom_masked_sad8x4x4d_c),
-  make_tuple(&aom_masked_sad8x8x4d_ssse3, &aom_masked_sad8x8x4d_c),
-  make_tuple(&aom_masked_sad8x16x4d_ssse3, &aom_masked_sad8x16x4d_c),
-  make_tuple(&aom_masked_sad16x8x4d_ssse3, &aom_masked_sad16x8x4d_c),
-  make_tuple(&aom_masked_sad16x16x4d_ssse3, &aom_masked_sad16x16x4d_c),
-  make_tuple(&aom_masked_sad16x32x4d_ssse3, &aom_masked_sad16x32x4d_c),
-  make_tuple(&aom_masked_sad32x16x4d_ssse3, &aom_masked_sad32x16x4d_c),
-  make_tuple(&aom_masked_sad32x32x4d_ssse3, &aom_masked_sad32x32x4d_c),
-  make_tuple(&aom_masked_sad32x64x4d_ssse3, &aom_masked_sad32x64x4d_c),
-  make_tuple(&aom_masked_sad64x32x4d_ssse3, &aom_masked_sad64x32x4d_c),
-  make_tuple(&aom_masked_sad64x64x4d_ssse3, &aom_masked_sad64x64x4d_c),
-  make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c),
-  make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c),
-  make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c),
-  make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c),
-  make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c),
-  make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c),
-  make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c),
-  make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c),
-  make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test,
-                         ::testing::ValuesIn(msadx4_test));
-
 const HighbdMaskedSADParam hbd_msad_test[] = {
   make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c),
   make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c),
@@ -422,34 +160,6 @@
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-const MaskedSADParam msad_avx2_test[] = {
-  make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3),
-  make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3),
-  make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3),
-  make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3),
-  make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3),
-  make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3),
-  make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3),
-  make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3),
-  make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3),
-  make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3),
-  make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3),
-  make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3),
-  make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3),
-  make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3),
-  make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3),
-  make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3),
-  make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3),
-  make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3),
-  make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3),
-  make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3),
-  make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3),
-  make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
-};
-
-INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest,
-                         ::testing::ValuesIn(msad_avx2_test));
-
 const HighbdMaskedSADParam hbd_msad_avx2_test[] = {
   make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3),
   make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3),
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index 0763c83..a3c87ce 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -39,140 +39,6 @@
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
-    MaskedSubPixelVarianceParam;
-
-class MaskedSubPixelVarianceTest
-    : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
- public:
-  virtual ~MaskedSubPixelVarianceTest() {}
-  virtual void SetUp() {
-    opt_func_ = GET_PARAM(0);
-    ref_func_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  MaskedSubPixelVarianceFunc opt_func_;
-  MaskedSubPixelVarianceFunc ref_func_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSubPixelVarianceTest);
-
-TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
-  unsigned int ref_ret, opt_ret;
-  unsigned int ref_sse, opt_sse;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  // Note: We pad out the input array to a multiple of 16 bytes wide, so that
-  // consecutive rows keep the 16-byte alignment.
-  DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = (MAX_SB_SIZE + 16);
-  int ref_stride = (MAX_SB_SIZE + 16);
-  int msk_stride = (MAX_SB_SIZE + 16);
-  int xoffset;
-  int yoffset;
-
-  for (int i = 0; i < number_of_iterations; ++i) {
-    int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
-    int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
-    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16); j++) {
-      src_ptr[j] = rnd.Rand8();
-      ref_ptr[j] = rnd.Rand8();
-      second_pred_ptr[j] = rnd.Rand8();
-      msk_ptr[j] = rnd(65);
-    }
-    for (int k = 0; k < 3; k++) {
-      for (int l = 0; l < 3; l++) {
-        xoffset = xoffsets[k];
-        yoffset = yoffsets[l];
-        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
-          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
-                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
-                              invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
-              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
-                                  ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
-                                  msk_stride, invert_mask, &opt_sse));
-
-          if (opt_ret != ref_ret || opt_sse != ref_sse) {
-            err_count++;
-            if (first_failure == -1) first_failure = i;
-          }
-        }
-      }
-    }
-  }
-
-  EXPECT_EQ(0, err_count)
-      << "Error: Masked Sub Pixel Variance Test OperationCheck,"
-      << "C output doesn't match SSSE3 output. "
-      << "First failed at test case " << first_failure;
-}
-
-TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
-  unsigned int ref_ret, opt_ret;
-  unsigned int ref_sse, opt_sse;
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  DECLARE_ALIGNED(16, uint8_t,
-                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
-  int first_failure_x = -1;
-  int first_failure_y = -1;
-  int err_count = 0;
-  int first_failure = -1;
-  int src_stride = (MAX_SB_SIZE + 16);
-  int ref_stride = (MAX_SB_SIZE + 16);
-  int msk_stride = (MAX_SB_SIZE + 16);
-
-  for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
-    for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
-      for (int i = 0; i < 16; ++i) {
-        memset(src_ptr, (i & 0x1) ? 255 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
-        memset(ref_ptr, (i & 0x2) ? 255 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
-        memset(second_pred_ptr, (i & 0x4) ? 255 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
-        memset(msk_ptr, (i & 0x8) ? 64 : 0,
-               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
-
-        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
-          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
-                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
-                              invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
-              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
-                                  ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
-                                  msk_stride, invert_mask, &opt_sse));
-
-          if (opt_ret != ref_ret || opt_sse != ref_sse) {
-            err_count++;
-            if (first_failure == -1) {
-              first_failure = i;
-              first_failure_x = xoffset;
-              first_failure_y = yoffset;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
-                          << "C output doesn't match SSSE3 output. "
-                          << "First failed at test case " << first_failure
-                          << " x_offset = " << first_failure_x
-                          << " y_offset = " << first_failure_y;
-}
-
 typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
                    aom_bit_depth_t>
     HighbdMaskedSubPixelVarianceParam;
@@ -319,58 +185,6 @@
 using std::make_tuple;
 
 #if HAVE_SSSE3
-
-const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
-  make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
-             &aom_masked_sub_pixel_variance128x128_c),
-  make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
-             &aom_masked_sub_pixel_variance128x64_c),
-  make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
-             &aom_masked_sub_pixel_variance64x128_c),
-  make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
-             &aom_masked_sub_pixel_variance64x64_c),
-  make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
-             &aom_masked_sub_pixel_variance64x32_c),
-  make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3,
-             &aom_masked_sub_pixel_variance32x64_c),
-  make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3,
-             &aom_masked_sub_pixel_variance32x32_c),
-  make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3,
-             &aom_masked_sub_pixel_variance32x16_c),
-  make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3,
-             &aom_masked_sub_pixel_variance16x32_c),
-  make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3,
-             &aom_masked_sub_pixel_variance16x16_c),
-  make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3,
-             &aom_masked_sub_pixel_variance16x8_c),
-  make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3,
-             &aom_masked_sub_pixel_variance8x16_c),
-  make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3,
-             &aom_masked_sub_pixel_variance8x8_c),
-  make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3,
-             &aom_masked_sub_pixel_variance8x4_c),
-  make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
-             &aom_masked_sub_pixel_variance4x8_c),
-  make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
-             &aom_masked_sub_pixel_variance4x4_c),
-
-  make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3,
-             &aom_masked_sub_pixel_variance64x16_c),
-  make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3,
-             &aom_masked_sub_pixel_variance16x64_c),
-  make_tuple(&aom_masked_sub_pixel_variance32x8_ssse3,
-             &aom_masked_sub_pixel_variance32x8_c),
-  make_tuple(&aom_masked_sub_pixel_variance8x32_ssse3,
-             &aom_masked_sub_pixel_variance8x32_c),
-  make_tuple(&aom_masked_sub_pixel_variance16x4_ssse3,
-             &aom_masked_sub_pixel_variance16x4_c),
-  make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3,
-             &aom_masked_sub_pixel_variance4x16_c),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
-                         ::testing::ValuesIn(sub_pel_var_test));
-
 const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
   make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3,
              &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index 30d52c3..767b87e 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -38,8 +38,9 @@
                                      aom_codec_pts_t pts) {
     (void)pts;
 
+    const uint16_t *const u_plane = (const uint16_t *)img.planes[AOM_PLANE_U];
     // Get value of top-left corner pixel of U plane
-    int chroma_value = img.planes[AOM_PLANE_U][0];
+    int chroma_value = u_plane[0];
 
     bool is_chroma_constant =
         ComparePlaneToValue(img, AOM_PLANE_U, chroma_value) &&
@@ -60,12 +61,13 @@
                            const int value) {
     const int w = aom_img_plane_width(&img, plane);
     const int h = aom_img_plane_height(&img, plane);
-    const uint8_t *const buf = img.planes[plane];
     const int stride = img.stride[plane];
 
     for (int r = 0; r < h; ++r) {
+      const uint16_t *const buf =
+          (const uint16_t *)(img.planes[plane] + r * stride);
       for (int c = 0; c < w; ++c) {
-        if (buf[r * stride + c] != value) return false;
+        if (buf[c] != value) return false;
       }
     }
     return true;
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index 41548d7..71fece9 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -268,7 +268,7 @@
 }
 
 TEST(NoiseModel, InitSuccessWithValidSquareShape) {
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8 };
   aom_noise_model_t model;
 
   EXPECT_TRUE(aom_noise_model_init(&model, params));
@@ -288,7 +288,7 @@
 
 TEST(NoiseModel, InitSuccessWithValidDiamondShape) {
   aom_noise_model_t model;
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8 };
   EXPECT_TRUE(aom_noise_model_init(&model, params));
   EXPECT_EQ(6, model.n);
   const int kNumCoords = 6;
@@ -305,21 +305,21 @@
 
 TEST(NoiseModel, InitFailsWithTooLargeLag) {
   aom_noise_model_t model;
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8 };
   EXPECT_FALSE(aom_noise_model_init(&model, params));
   aom_noise_model_free(&model);
 }
 
 TEST(NoiseModel, InitFailsWithTooSmallLag) {
   aom_noise_model_t model;
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8 };
   EXPECT_FALSE(aom_noise_model_init(&model, params));
   aom_noise_model_free(&model);
 }
 
 TEST(NoiseModel, InitFailsWithInvalidShape) {
   aom_noise_model_t model;
-  aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8, 0 };
+  aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8 };
   EXPECT_FALSE(aom_noise_model_init(&model, params));
   aom_noise_model_free(&model);
 }
@@ -328,11 +328,10 @@
 // All of these args are bundled into one struct so that we can use
 // parameterized tests on combinations of supported data types
 // (uint8_t and uint16_t) and bit depths (8, 10, 12).
-template <typename T, int bit_depth, bool use_highbd>
+template <typename T, int bit_depth>
 struct BitDepthParams {
   typedef T data_type_t;
   static const int kBitDepth = bit_depth;
-  static const bool kUseHighBD = use_highbd;
 };
 
 template <typename T>
@@ -350,7 +349,7 @@
   const int kBlockSize = 16;
   aom_flat_block_finder_t flat_block_finder;
   ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
-                                          this->kBitDepth, this->kUseHighBD));
+                                          this->kBitDepth));
   const double normalization = flat_block_finder.normalization;
 
   // Test with an image of more than one block.
@@ -405,7 +404,7 @@
   const int kBlockSize = 32;
   aom_flat_block_finder_t flat_block_finder;
   ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
-                                          this->kBitDepth, this->kUseHighBD));
+                                          this->kBitDepth));
 
   const int num_blocks_w = 8;
   const int h = kBlockSize;
@@ -498,10 +497,9 @@
 REGISTER_TYPED_TEST_SUITE_P(FlatBlockEstimatorTest, ExtractBlock,
                             FindFlatBlocks);
 
-typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>,   // lowbd
-                         BitDepthParams<uint16_t, 8, true>,   // lowbd in 16-bit
-                         BitDepthParams<uint16_t, 10, true>,  // highbd data
-                         BitDepthParams<uint16_t, 12, true> >
+typedef ::testing::Types<BitDepthParams<uint16_t, 8>,   // lowbd in 16-bit
+                         BitDepthParams<uint16_t, 10>,  // highbd data
+                         BitDepthParams<uint16_t, 12> >
     AllBitDepthParams;
 INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
                                AllBitDepthParams);
@@ -517,7 +515,7 @@
 
   virtual void SetUp() {
     const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
-                                              T::kBitDepth, T::kUseHighBD };
+                                              T::kBitDepth };
     ASSERT_TRUE(aom_noise_model_init(&model_, params));
 
     random_.Reset(100171);
@@ -945,7 +943,7 @@
 TEST(NoiseModelGetGrainParameters, TestLagSize) {
   aom_film_grain_t film_grain;
   for (int lag = 1; lag <= 3; ++lag) {
-    aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+    aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8 };
     aom_noise_model_t model;
     EXPECT_TRUE(aom_noise_model_init(&model, params));
     EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
@@ -953,7 +951,7 @@
     aom_noise_model_free(&model);
   }
 
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8 };
   aom_noise_model_t model;
   EXPECT_TRUE(aom_noise_model_init(&model, params));
   EXPECT_FALSE(aom_noise_model_get_grain_parameters(&model, &film_grain));
@@ -993,7 +991,7 @@
     { 4, 6, 127 },
     { -4, 6, -128 },
   };
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8 };
   aom_noise_model_t model;
   EXPECT_TRUE(aom_noise_model_init(&model, params));
 
@@ -1023,7 +1021,7 @@
     { 31.99, 8, 255 }, { 64, 8, 255 },  // clipped
   };
   const int lag = 1;
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8 };
   aom_noise_model_t model;
   EXPECT_TRUE(aom_noise_model_init(&model, params));
 
@@ -1072,7 +1070,7 @@
   };
 
   const int lag = 3;
-  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8 };
   aom_noise_model_t model;
   EXPECT_TRUE(aom_noise_model_init(&model, params));
 
@@ -1243,18 +1241,18 @@
     reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
     reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
   };
-  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
-                                     this->kHeight, this->stride_,
-                                     this->chroma_sub_, this->noise_psd_ptrs_,
-                                     18, this->kBitDepth, this->kUseHighBD));
-  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
-                                     this->kHeight, this->stride_,
-                                     this->chroma_sub_, this->noise_psd_ptrs_,
-                                     48, this->kBitDepth, this->kUseHighBD));
-  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
-                                     this->kHeight, this->stride_,
-                                     this->chroma_sub_, this->noise_psd_ptrs_,
-                                     64, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(
+      0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                               this->kHeight, this->stride_, this->chroma_sub_,
+                               this->noise_psd_ptrs_, 18, this->kBitDepth));
+  EXPECT_EQ(
+      0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                               this->kHeight, this->stride_, this->chroma_sub_,
+                               this->noise_psd_ptrs_, 48, this->kBitDepth));
+  EXPECT_EQ(
+      0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                               this->kHeight, this->stride_, this->chroma_sub_,
+                               this->noise_psd_ptrs_, 64, this->kBitDepth));
 }
 
 TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
@@ -1269,17 +1267,17 @@
     reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
   };
   int chroma_sub[2] = { 1, 0 };
-  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
-                                     this->kHeight, this->stride_, chroma_sub,
-                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
-                                     this->kUseHighBD));
+  EXPECT_EQ(0,
+            aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                  this->kHeight, this->stride_, chroma_sub,
+                                  this->noise_psd_ptrs_, 32, this->kBitDepth));
 
   chroma_sub[0] = 0;
   chroma_sub[1] = 1;
-  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
-                                     this->kHeight, this->stride_, chroma_sub,
-                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
-                                     this->kUseHighBD));
+  EXPECT_EQ(0,
+            aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                  this->kHeight, this->stride_, chroma_sub,
+                                  this->noise_psd_ptrs_, 32, this->kBitDepth));
 }
 
 TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
@@ -1299,7 +1297,7 @@
   const int ret = aom_wiener_denoise_2d(
       data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
       this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
-      this->kBitDepth, this->kUseHighBD);
+      this->kBitDepth);
   EXPECT_EQ(1, ret);
 
   // Check the noise on the denoised image (from the analytical gradient)
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index dfc98f1..342397a 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -34,121 +34,6 @@
 typedef libaom_test::FuncParam<ObmcSadF> TestFuncs;
 
 ////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadTest);
-
-TEST_P(ObmcSadTest, RandomValues) {
-  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
-
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    const int pre_stride = rng_(MAX_SB_SIZE + 1);
-
-    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-      pre[i] = rng_.Rand8();
-      wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
-      mask[i] = rng_(kMaskMax * kMaskMax + 1);
-    }
-
-    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
-    unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res =
-                                 params_.tst_func(pre, pre_stride, wsrc, mask));
-
-    ASSERT_EQ(ref_res, tst_res);
-  }
-}
-
-TEST_P(ObmcSadTest, ExtremeValues) {
-  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
-
-  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
-    const int pre_stride = iter;
-
-    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-      pre[i] = UINT8_MAX;
-      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
-      mask[i] = kMaskMax * kMaskMax;
-    }
-
-    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
-    unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res =
-                                 params_.tst_func(pre, pre_stride, wsrc, mask));
-
-    ASSERT_EQ(ref_res, tst_res);
-  }
-}
-
-#if HAVE_SSE4_1
-const ObmcSadTest::ParamType sse4_functions[] = {
-  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
-  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
-  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
-  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
-  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
-  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
-  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_sse4_1),
-  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_sse4_1),
-  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_sse4_1),
-  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_sse4_1),
-  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_sse4_1),
-  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_sse4_1),
-  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
-  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
-  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
-  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1),
-
-  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_sse4_1),
-  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_sse4_1),
-  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_sse4_1),
-  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_sse4_1),
-  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_sse4_1),
-  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_sse4_1),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadTest,
-                         ::testing::ValuesIn(sse4_functions));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-const ObmcSadTest::ParamType avx2_functions[] = {
-  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2),
-  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2),
-  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2),
-  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2),
-  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2),
-  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2),
-  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2),
-  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2),
-  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2),
-  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2),
-  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2),
-  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2),
-  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
-  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
-  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
-  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2),
-
-  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_avx2),
-  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_avx2),
-  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_avx2),
-  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_avx2),
-  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_avx2),
-  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_avx2),
-};
-
-INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadTest,
-                         ::testing::ValuesIn(avx2_functions));
-#endif  // HAVE_AVX2
-
-////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index b3a2da3..321cfae 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -37,164 +37,6 @@
 typedef libaom_test::FuncParam<ObmcVarF> TestFuncs;
 
 ////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceTest);
-
-TEST_P(ObmcVarianceTest, RandomValues) {
-  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
-
-  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
-    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
-
-    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-      pre[i] = this->rng_.Rand8();
-      wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
-      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
-    }
-
-    unsigned int ref_sse, tst_sse;
-    const unsigned int ref_res =
-        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
-    unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
-        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
-
-    ASSERT_EQ(ref_res, tst_res);
-    ASSERT_EQ(ref_sse, tst_sse);
-  }
-}
-
-TEST_P(ObmcVarianceTest, ExtremeValues) {
-  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
-
-  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
-    const int pre_stride = iter;
-
-    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-      pre[i] = UINT8_MAX;
-      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
-      mask[i] = kMaskMax * kMaskMax;
-    }
-
-    unsigned int ref_sse, tst_sse;
-    const unsigned int ref_res =
-        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
-    unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
-        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
-
-    ASSERT_EQ(ref_res, tst_res);
-    ASSERT_EQ(ref_sse, tst_sse);
-  }
-}
-
-TEST_P(ObmcVarianceTest, DISABLED_Speed) {
-  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
-
-  const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
-
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
-    pre[i] = this->rng_.Rand8();
-    wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
-    mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
-  }
-
-  const int num_loops = 1000000;
-  unsigned int ref_sse, tst_sse;
-  aom_usec_timer ref_timer, test_timer;
-
-  aom_usec_timer_start(&ref_timer);
-  for (int i = 0; i < num_loops; ++i) {
-    params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
-  }
-  aom_usec_timer_mark(&ref_timer);
-  const int elapsed_time_c =
-      static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
-
-  aom_usec_timer_start(&test_timer);
-  for (int i = 0; i < num_loops; ++i) {
-    params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse);
-  }
-  aom_usec_timer_mark(&test_timer);
-  const int elapsed_time_simd =
-      static_cast<int>(aom_usec_timer_elapsed(&test_timer));
-
-  printf("c_time=%d \t simd_time=%d \t gain=%d \n", elapsed_time_c,
-         elapsed_time_simd, (elapsed_time_c / elapsed_time_simd));
-}
-
-#if HAVE_SSE4_1
-const ObmcVarianceTest::ParamType sse4_functions[] = {
-  TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1),
-  TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1),
-  TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1),
-  TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1),
-  TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1),
-  TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1),
-  TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_sse4_1),
-  TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_sse4_1),
-  TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_sse4_1),
-  TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_sse4_1),
-  TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_sse4_1),
-  TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_sse4_1),
-  TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_sse4_1),
-  TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_sse4_1),
-  TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
-  TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1),
-
-  TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_sse4_1),
-  TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_sse4_1),
-  TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_sse4_1),
-  TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_sse4_1),
-  TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_sse4_1),
-  TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_sse4_1),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceTest,
-                         ::testing::ValuesIn(sse4_functions));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-const ObmcVarianceTest::ParamType avx2_functions[] = {
-  TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_avx2),
-  TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_avx2),
-  TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_avx2),
-  TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_avx2),
-  TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_avx2),
-  TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_avx2),
-  TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_avx2),
-  TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_avx2),
-  TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_avx2),
-  TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_avx2),
-  TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_avx2),
-  TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2),
-  TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2),
-  TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2),
-  TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_avx2),
-  TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_avx2),
-
-  TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_avx2),
-  TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_avx2),
-  TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_avx2),
-  TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_avx2),
-  TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_avx2),
-  TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_avx2),
-};
-
-INSTANTIATE_TEST_SUITE_P(AVX2, ObmcVarianceTest,
-                         ::testing::ValuesIn(avx2_functions));
-#endif  // HAVE_AVX2
-
-////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
diff --git a/test/opt_flow_test.cc b/test/opt_flow_test.cc
index 0273bd6..abd92a5 100644
--- a/test/opt_flow_test.cc
+++ b/test/opt_flow_test.cc
@@ -306,233 +306,6 @@
 }
 
 template <typename T>
-std::vector<TestParam<T>> GetOptFlowLowbdTestParams(T test_func) {
-  return GetOptFlowTestParams({ 8 }, test_func);
-}
-
-template <typename T>
-::testing::internal::ParamGenerator<TestParam<T>> BuildOptFlowParams(
-    T test_func) {
-  return ::testing::ValuesIn(GetOptFlowLowbdTestParams(test_func));
-}
-
-typedef int (*opfl_mv_refinement)(const uint8_t *p0, int pstride0,
-                                  const uint8_t *p1, int pstride1,
-                                  const int16_t *gx0, const int16_t *gy0,
-                                  const int16_t *gx1, const int16_t *gy1,
-                                  int gstride, int bw, int bh, int n, int d0,
-                                  int d1, int grad_prec_bits, int mv_prec_bits,
-                                  int *vx0, int *vy0, int *vx1, int *vy1);
-
-class AV1OptFlowRefineTest : public AV1OptFlowTest<opfl_mv_refinement> {
- public:
-  AV1OptFlowRefineTest() {
-    const BlockSize &block = GetParam().Block();
-    const int bw = block.Width();
-    const int bh = block.Height();
-
-    input0_ = (uint8_t *)aom_memalign(16, bw * bh * sizeof(uint8_t));
-    input1_ = (uint8_t *)aom_memalign(16, bw * bh * sizeof(uint8_t));
-    gx0_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    gy0_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    gx1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    gy1_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-  }
-
-  ~AV1OptFlowRefineTest() {
-    aom_free(input0_);
-    aom_free(input1_);
-    aom_free(gx0_);
-    aom_free(gy0_);
-    aom_free(gx1_);
-    aom_free(gy1_);
-  }
-
-  void RunTest(const int is_speed) {
-    OrderHintInfo oh_info;
-    const BlockSize &block = GetParam().Block();
-    const int bw_log2 = block.Width() >> MI_SIZE_LOG2;
-    const int bh_log2 = block.Height() >> MI_SIZE_LOG2;
-    const int bd = GetParam().BitDepth();
-    const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2);
-    const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1;
-
-    oh_info.enable_order_hint = 1;
-    for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) {
-      for (int count = 0; count < numIter;) {
-        const int cur_frm_idx = RandomFrameIdx(oh_bits);
-        const int ref0_frm_idx = RandomFrameIdx(oh_bits);
-        const int ref1_frm_idx = RandomFrameIdx(oh_bits);
-
-        oh_info.order_hint_bits_minus_1 = oh_bits - 1;
-        const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx);
-        const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx);
-        if (!d0 || !d1) continue;
-
-        RandomInput8(input0_, GetParam());
-        RandomInput8(input1_, GetParam());
-        RandomInput9(gx0_, GetParam());
-        RandomInput9(gy0_, GetParam());
-        RandomInput9(gx1_, GetParam());
-        RandomInput9(gy1_, GetParam());
-
-        TestOptFlowRefine(input0_, input1_, gx0_, gy0_, gx1_, gy1_, is_speed,
-                          d0, d1);
-        count++;
-      }
-    }
-    if (is_speed) return;
-
-    // Extreme value test
-    for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits;
-         oh_bits += kMaxOrderHintBits - 1) {
-      for (int count = 0; count < numIter;) {
-        const int d0 = RelativeDistExtreme(oh_bits);
-        const int d1 = RelativeDistExtreme(oh_bits);
-        if (!d0 || !d1) continue;
-
-        RandomInput8Extreme(input0_, GetParam());
-        RandomInput8Extreme(input1_, GetParam());
-        RandomInput9Extreme(gx0_, GetParam(), bd + 1);
-        RandomInput9Extreme(gy0_, GetParam(), bd + 1);
-        RandomInput9Extreme(gx1_, GetParam(), bd + 1);
-        RandomInput9Extreme(gy1_, GetParam(), bd + 1);
-
-        TestOptFlowRefine(input0_, input1_, gx0_, gy0_, gx1_, gy1_, 0, d0, d1);
-        count++;
-      }
-    }
-  }
-
- private:
-  void TestOptFlowRefine(uint8_t *input0, uint8_t *input1, int16_t *gx0,
-                         int16_t *gy0, int16_t *gx1, int16_t *gy1,
-                         const int is_speed, int d0, int d1) {
-    const BlockSize &block = GetParam().Block();
-    const int bw = block.Width();
-    const int bh = block.Height();
-    const int n = block.OptFlowBlkSize();
-
-    opfl_mv_refinement ref_func = av1_opfl_mv_refinement_nxn_lowbd_c;
-    opfl_mv_refinement test_func = GetParam().TestFunction();
-
-    if (is_speed)
-      OptFlowRefineSpeed(ref_func, test_func, input0, input1, gx0, gy0, gx1,
-                         gy1, bw, bh, n, d0, d1);
-    else
-      OptFlowRefine(ref_func, test_func, input0, input1, gx0, gy0, gx1, gy1, bw,
-                    bh, n, d0, d1);
-  }
-
-  void OptFlowRefine(opfl_mv_refinement ref_func, opfl_mv_refinement test_func,
-                     const uint8_t *input0, const uint8_t *input1,
-                     const int16_t *gx0, const int16_t *gy0, const int16_t *gx1,
-                     const int16_t *gy1, int bw, int bh, int n, int d0,
-                     int d1) {
-    int ref_out[4 * N_OF_OFFSETS] = { 0 };
-    int test_out[4 * N_OF_OFFSETS] = { 0 };
-    const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2;
-    const int mv_prec_bits = MV_REFINE_PREC_BITS;
-    int stride0 = bw;
-    int stride1 = bw;
-    int gstride = bw;
-    int n_blocks = 0;
-
-    n_blocks = ref_func(
-        input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh,
-        n, d0, d1, grad_prec_bits, mv_prec_bits, &ref_out[kVX_0 * N_OF_OFFSETS],
-        &ref_out[kVY_0 * N_OF_OFFSETS], &ref_out[kVX_1 * N_OF_OFFSETS],
-        &ref_out[kVY_1 * N_OF_OFFSETS]);
-    test_func(input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw,
-              bh, n, d0, d1, grad_prec_bits, mv_prec_bits,
-              &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS],
-              &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]);
-
-    AssertOutputEq(&ref_out[kVX_0 * N_OF_OFFSETS],
-                   &test_out[kVX_0 * N_OF_OFFSETS], n_blocks);
-    AssertOutputEq(&ref_out[kVY_0 * N_OF_OFFSETS],
-                   &test_out[kVY_0 * N_OF_OFFSETS], n_blocks);
-    AssertOutputEq(&ref_out[kVX_1 * N_OF_OFFSETS],
-                   &test_out[kVX_1 * N_OF_OFFSETS], n_blocks);
-    AssertOutputEq(&ref_out[kVY_1 * N_OF_OFFSETS],
-                   &test_out[kVY_1 * N_OF_OFFSETS], n_blocks);
-  }
-
-  void OptFlowRefineSpeed(opfl_mv_refinement ref_func,
-                          opfl_mv_refinement test_func, const uint8_t *input0,
-                          const uint8_t *input1, const int16_t *gx0,
-                          const int16_t *gy0, const int16_t *gx1,
-                          const int16_t *gy1, int bw, int bh, int n, int d0,
-                          int d1) {
-    int ref_out[4 * N_OF_OFFSETS] = { 0 };
-    int test_out[4 * N_OF_OFFSETS] = { 0 };
-    const int grad_prec_bits = 3 - kSubpelGradDeltaBits - 2;
-    const int mv_prec_bits = MV_REFINE_PREC_BITS;
-    const int bw_log2 = bw >> MI_SIZE_LOG2;
-    const int bh_log2 = bh >> MI_SIZE_LOG2;
-    int stride0 = bw;
-    int stride1 = bw;
-    int gstride = bw;
-
-    const int numIter = 2097152 / (bw_log2 * bh_log2);
-    aom_usec_timer timer_ref;
-    aom_usec_timer timer_test;
-
-    aom_usec_timer_start(&timer_ref);
-    for (int count = 0; count < numIter; count++) {
-      ref_func(input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride,
-               bw, bh, n, d0, d1, grad_prec_bits, mv_prec_bits,
-               &ref_out[kVX_0 * N_OF_OFFSETS], &ref_out[kVY_0 * N_OF_OFFSETS],
-               &ref_out[kVX_1 * N_OF_OFFSETS], &ref_out[kVY_1 * N_OF_OFFSETS]);
-    }
-    aom_usec_timer_mark(&timer_ref);
-
-    aom_usec_timer_start(&timer_test);
-    for (int count = 0; count < numIter; count++) {
-      test_func(
-          input0, stride0, input1, stride1, gx0, gy0, gx1, gy1, gstride, bw, bh,
-          n, d0, d1, grad_prec_bits, mv_prec_bits,
-          &test_out[kVX_0 * N_OF_OFFSETS], &test_out[kVY_0 * N_OF_OFFSETS],
-          &test_out[kVX_1 * N_OF_OFFSETS], &test_out[kVY_1 * N_OF_OFFSETS]);
-    }
-    aom_usec_timer_mark(&timer_test);
-
-    const int total_time_ref =
-        static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
-    const int total_time_test =
-        static_cast<int>(aom_usec_timer_elapsed(&timer_test));
-
-    printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
-           total_time_test,
-           (static_cast<float>(total_time_ref) /
-            static_cast<float>(total_time_test)));
-  }
-  static constexpr int kVX_0 = 0;
-  static constexpr int kVX_1 = 1;
-  static constexpr int kVY_0 = 2;
-  static constexpr int kVY_1 = 3;
-  static constexpr int kMaxOrderHintBits = 8;
-  static constexpr int kSubpelGradDeltaBits = 3;
-  uint8_t *input0_;
-  uint8_t *input1_;
-  int16_t *gx0_;
-  int16_t *gy0_;
-  int16_t *gx1_;
-  int16_t *gy1_;
-};
-TEST_P(AV1OptFlowRefineTest, CheckOutput) { RunTest(0); }
-TEST_P(AV1OptFlowRefineTest, DISABLED_Speed) { RunTest(1); }
-
-INSTANTIATE_TEST_SUITE_P(
-    C, AV1OptFlowRefineTest,
-    BuildOptFlowParams(av1_opfl_mv_refinement_nxn_lowbd_c));
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, AV1OptFlowRefineTest,
-    BuildOptFlowParams(av1_opfl_mv_refinement_nxn_lowbd_sse4_1));
-#endif
-
-template <typename T>
 std::vector<TestParam<T>> GetOptFlowHighbdTestParams(T test_func) {
   return GetOptFlowTestParams({ 8, 10, 12 }, test_func);
 }
@@ -765,141 +538,6 @@
 #endif
 
 #if OPFL_BICUBIC_GRAD
-typedef void (*bicubic_grad_interp_lowbd)(const int16_t *pred_src,
-                                          int16_t *x_grad, int16_t *y_grad,
-                                          const int blk_width,
-                                          const int blk_height);
-
-class AV1OptFlowBiCubicGradLowbdTest
-    : public AV1OptFlowTest<bicubic_grad_interp_lowbd> {
- public:
-  AV1OptFlowBiCubicGradLowbdTest() {
-    const BlockSize &block = GetParam().Block();
-    const int bw = block.Width();
-    const int bh = block.Height();
-
-    pred_src_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    x_grad_ref_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    y_grad_ref_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    x_grad_test_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-    y_grad_test_ = (int16_t *)aom_memalign(16, bw * bh * sizeof(int16_t));
-
-    memset(x_grad_ref_, 0, bw * bh * sizeof(int16_t));
-    memset(y_grad_ref_, 0, bw * bh * sizeof(int16_t));
-    memset(x_grad_test_, 0, bw * bh * sizeof(int16_t));
-    memset(y_grad_test_, 0, bw * bh * sizeof(int16_t));
-  }
-
-  ~AV1OptFlowBiCubicGradLowbdTest() {
-    aom_free(pred_src_);
-    aom_free(x_grad_ref_);
-    aom_free(y_grad_ref_);
-    aom_free(x_grad_test_);
-    aom_free(y_grad_test_);
-  }
-
-  void RunTest(const int is_speed) {
-    const BlockSize &block = GetParam().Block();
-    const int bd = GetParam().BitDepth();
-    const int bw_log2 = block.Width() >> MI_SIZE_LOG2;
-    const int bh_log2 = block.Height() >> MI_SIZE_LOG2;
-    const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2);
-
-    for (int count = 0; count < numIter; count++) {
-      RandomInput16(pred_src_, GetParam(), bd);
-      TestBicubicGrad(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_,
-                      y_grad_test_, is_speed);
-    }
-    if (is_speed) return;
-
-    for (int count = 0; count < numIter; count++) {
-      RandomInput16Extreme(pred_src_, GetParam(), bd);
-      TestBicubicGrad(pred_src_, x_grad_ref_, y_grad_ref_, x_grad_test_,
-                      y_grad_test_, 0);
-    }
-  }
-
- private:
-  void TestBicubicGrad(int16_t *pred_src, int16_t *x_grad_ref,
-                       int16_t *y_grad_ref, int16_t *x_grad_test,
-                       int16_t *y_grad_test, int is_speed) {
-    const BlockSize &block = GetParam().Block();
-    const int bw = block.Width();
-    const int bh = block.Height();
-
-    bicubic_grad_interp_lowbd ref_func = av1_bicubic_grad_interpolation_c;
-    bicubic_grad_interp_lowbd test_func = GetParam().TestFunction();
-    if (is_speed)
-      BicubicGradSpeed(ref_func, test_func, pred_src, x_grad_ref, y_grad_ref,
-                       x_grad_test, y_grad_test, bw, bh);
-    else
-      BicubicGrad(ref_func, test_func, pred_src, x_grad_ref, y_grad_ref,
-                  x_grad_test, y_grad_test, bw, bh);
-  }
-
-  void BicubicGrad(bicubic_grad_interp_lowbd ref_func,
-                   bicubic_grad_interp_lowbd test_func, const int16_t *pred_src,
-                   int16_t *x_grad_ref, int16_t *y_grad_ref,
-                   int16_t *x_grad_test, int16_t *y_grad_test, const int bw,
-                   const int bh) {
-    ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh);
-    test_func(pred_src, x_grad_test, y_grad_test, bw, bh);
-
-    AssertOutputBufferEq(x_grad_ref, x_grad_test, bw, bh);
-    AssertOutputBufferEq(y_grad_ref, y_grad_test, bw, bh);
-  }
-
-  void BicubicGradSpeed(bicubic_grad_interp_lowbd ref_func,
-                        bicubic_grad_interp_lowbd test_func, int16_t *pred_src,
-                        int16_t *x_grad_ref, int16_t *y_grad_ref,
-                        int16_t *x_grad_test, int16_t *y_grad_test,
-                        const int bw, const int bh) {
-    const int bw_log2 = bw >> MI_SIZE_LOG2;
-    const int bh_log2 = bh >> MI_SIZE_LOG2;
-
-    const int numIter = 2097152 / (bw_log2 * bh_log2);
-    aom_usec_timer timer_ref;
-    aom_usec_timer timer_test;
-
-    aom_usec_timer_start(&timer_ref);
-    for (int count = 0; count < numIter; count++)
-      ref_func(pred_src, x_grad_ref, y_grad_ref, bw, bh);
-    aom_usec_timer_mark(&timer_ref);
-
-    aom_usec_timer_start(&timer_test);
-    for (int count = 0; count < numIter; count++)
-      test_func(pred_src, x_grad_test, y_grad_test, bw, bh);
-    aom_usec_timer_mark(&timer_test);
-
-    const int total_time_ref =
-        static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
-    const int total_time_test =
-        static_cast<int>(aom_usec_timer_elapsed(&timer_test));
-
-    printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
-           total_time_test,
-           (static_cast<float>(total_time_ref) /
-            static_cast<float>(total_time_test)));
-  }
-
-  int16_t *pred_src_;
-  int16_t *x_grad_ref_;
-  int16_t *y_grad_ref_;
-  int16_t *x_grad_test_;
-  int16_t *y_grad_test_;
-};
-TEST_P(AV1OptFlowBiCubicGradLowbdTest, CheckOutput) { RunTest(0); }
-TEST_P(AV1OptFlowBiCubicGradLowbdTest, DISABLED_Speed) { RunTest(1); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1OptFlowBiCubicGradLowbdTest,
-                         BuildOptFlowParams(av1_bicubic_grad_interpolation_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, AV1OptFlowBiCubicGradLowbdTest,
-    BuildOptFlowParams(av1_bicubic_grad_interpolation_sse4_1));
-#endif
-
 typedef void (*bicubic_grad_interp_highbd)(const int16_t *pred_src,
                                            int16_t *x_grad, int16_t *y_grad,
                                            const int blk_width,
@@ -1237,174 +875,6 @@
 #endif  // OPFL_COMBINE_INTERP_GRAD_LS
 
 #if OPFL_BILINEAR_GRAD || OPFL_BICUBIC_GRAD
-typedef void (*pred_buffer_copy)(const uint8_t *src1, const uint8_t *src2,
-                                 int16_t *dst1, int16_t *dst2, int bw, int bh,
-                                 int d0, int d1);
-
-class AV1OptFlowCopyPredTest : public AV1OptFlowTest<pred_buffer_copy> {
- public:
-  AV1OptFlowCopyPredTest() {
-    const BlockSize &block = GetParam().Block();
-    const int bw = block.Width();
-    const int bh = block.Height();
-
-    src_buf1_ = (uint8_t *)aom_memalign(16, bw * bh * sizeof(*src_buf1_));
-    src_buf2_ = (uint8_t *)aom_memalign(16, bw * bh * sizeof(*src_buf2_));
-    dst_buf1_ref_ =
-        (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf1_ref_));
-    dst_buf2_ref_ =
-        (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf2_ref_));
-    dst_buf1_test_ =
-        (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf1_test_));
-    dst_buf2_test_ =
-        (int16_t *)aom_memalign(16, bw * bh * sizeof(*dst_buf2_test_));
-
-    memset(dst_buf2_ref_, 0, bw * bh * sizeof(*dst_buf2_ref_));
-    memset(dst_buf2_test_, 0, bw * bh * sizeof(*dst_buf2_test_));
-  }
-
-  ~AV1OptFlowCopyPredTest() {
-    aom_free(src_buf1_);
-    aom_free(src_buf2_);
-    aom_free(dst_buf1_ref_);
-    aom_free(dst_buf2_ref_);
-    aom_free(dst_buf1_test_);
-    aom_free(dst_buf2_test_);
-  }
-
-  void Run(const int is_speed) {
-    OrderHintInfo oh_info;
-    const BlockSize &block = GetParam().Block();
-    const int bw_log2 = block.Width() >> MI_SIZE_LOG2;
-    const int bh_log2 = block.Height() >> MI_SIZE_LOG2;
-    const int numIter = is_speed ? 1 : 16384 / (bw_log2 * bh_log2);
-    const int oh_start_bits = is_speed ? kMaxOrderHintBits : 1;
-
-    oh_info.enable_order_hint = 1;
-    for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits; oh_bits++) {
-      for (int count = 0; count < numIter;) {
-        const int cur_frm_idx = RandomFrameIdx(oh_bits);
-        const int ref0_frm_idx = RandomFrameIdx(oh_bits);
-        const int ref1_frm_idx = RandomFrameIdx(oh_bits);
-
-        oh_info.order_hint_bits_minus_1 = oh_bits - 1;
-        const int d0 = get_relative_dist(&oh_info, cur_frm_idx, ref0_frm_idx);
-        const int d1 = get_relative_dist(&oh_info, cur_frm_idx, ref1_frm_idx);
-        if (!d0 || !d1) continue;
-
-        RandomInput8(src_buf1_, GetParam());
-        RandomInput8(src_buf2_, GetParam());
-        TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_,
-                          dst_buf1_test_, dst_buf2_test_, d0, d1, is_speed);
-        count++;
-      }
-    }
-    if (is_speed) return;
-
-    // Extreme value test
-    for (int oh_bits = oh_start_bits; oh_bits <= kMaxOrderHintBits;
-         oh_bits += kMaxOrderHintBits - 1) {
-      for (int count = 0; count < numIter;) {
-        const int d0 = RelativeDistExtreme(oh_bits);
-        const int d1 = RelativeDistExtreme(oh_bits);
-        if (!d0 || !d1) continue;
-
-        RandomInput8Extreme(src_buf1_, GetParam());
-        RandomInput8Extreme(src_buf2_, GetParam());
-        TestCopyPredArray(src_buf1_, src_buf2_, dst_buf1_ref_, dst_buf2_ref_,
-                          dst_buf1_test_, dst_buf2_test_, d0, d1, 0);
-        count++;
-      }
-    }
-  }
-
- private:
-  void TestCopyPredArray(uint8_t *src_buf1, uint8_t *src_buf2,
-                         int16_t *dst_buf1_ref, int16_t *dst_buf2_ref,
-                         int16_t *dst_buf1_test, int16_t *dst_buf2_test, int d0,
-                         int d1, int is_speed) {
-    const BlockSize &block = GetParam().Block();
-    const int bw = block.Width();
-    const int bh = block.Height();
-
-    pred_buffer_copy ref_func = av1_copy_pred_array_c;
-    pred_buffer_copy test_func = GetParam().TestFunction();
-    if (is_speed)
-      CopyPredArraySpeed(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref,
-                         dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw,
-                         bh);
-    else
-      CopyPredArray(ref_func, test_func, src_buf1, src_buf2, dst_buf1_ref,
-                    dst_buf2_ref, dst_buf1_test, dst_buf2_test, d0, d1, bw, bh);
-  }
-
-  void CopyPredArray(pred_buffer_copy ref_func, pred_buffer_copy test_func,
-                     const uint8_t *src_buf1, uint8_t *src_buf2,
-                     int16_t *dst_buf1_ref, int16_t *dst_buf2_ref,
-                     int16_t *dst_buf1_test, int16_t *dst_buf2_test,
-                     const int d0, const int d1, const int bw, const int bh) {
-    ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1);
-    test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0, d1);
-
-    AssertOutputBufferEq(dst_buf1_ref, dst_buf1_test, bw, bh);
-    AssertOutputBufferEq(dst_buf2_ref, dst_buf2_test, bw, bh);
-  }
-
-  void CopyPredArraySpeed(pred_buffer_copy ref_func, pred_buffer_copy test_func,
-                          const uint8_t *src_buf1, uint8_t *src_buf2,
-                          int16_t *dst_buf1_ref, int16_t *dst_buf2_ref,
-                          int16_t *dst_buf1_test, int16_t *dst_buf2_test,
-                          const int d0, const int d1, const int bw,
-                          const int bh) {
-    const int bw_log2 = bw >> MI_SIZE_LOG2;
-    const int bh_log2 = bh >> MI_SIZE_LOG2;
-    printf("bw=%d, bh=%d\n", bw, bh);
-    const int numIter = 2097152 / (bw_log2 * bh_log2);
-    aom_usec_timer timer_ref;
-    aom_usec_timer timer_test;
-
-    aom_usec_timer_start(&timer_ref);
-    for (int count = 0; count < numIter; count++)
-      ref_func(src_buf1, src_buf2, dst_buf1_ref, dst_buf2_ref, bw, bh, d0, d1);
-    aom_usec_timer_mark(&timer_ref);
-
-    aom_usec_timer_start(&timer_test);
-    for (int count = 0; count < numIter; count++)
-      test_func(src_buf1, src_buf2, dst_buf1_test, dst_buf2_test, bw, bh, d0,
-                d1);
-    aom_usec_timer_mark(&timer_test);
-
-    const int total_time_ref =
-        static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
-    const int total_time_test =
-        static_cast<int>(aom_usec_timer_elapsed(&timer_test));
-
-    printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f \n", total_time_ref,
-           total_time_test,
-           (static_cast<float>(total_time_ref) /
-            static_cast<float>(total_time_test)));
-  }
-
-  uint8_t *src_buf1_;
-  uint8_t *src_buf2_;
-  int16_t *dst_buf1_ref_;
-  int16_t *dst_buf2_ref_;
-  int16_t *dst_buf1_test_;
-  int16_t *dst_buf2_test_;
-  static constexpr int kMaxOrderHintBits = 8;
-};
-
-TEST_P(AV1OptFlowCopyPredTest, CheckOutput) { Run(0); }
-TEST_P(AV1OptFlowCopyPredTest, DISABLED_Speed) { Run(1); }
-
-INSTANTIATE_TEST_SUITE_P(C, AV1OptFlowCopyPredTest,
-                         BuildOptFlowParams(av1_copy_pred_array_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1OptFlowCopyPredTest,
-                         BuildOptFlowParams(av1_copy_pred_array_sse4_1));
-#endif
-
 typedef void (*pred_buffer_copy_highbd)(const uint16_t *src1,
                                         const uint16_t *src2, int16_t *dst1,
                                         int16_t *dst2, int bw, int bh, int d0,
diff --git a/test/pickrst_test.cc b/test/pickrst_test.cc
index ce50e89..815358a 100644
--- a/test/pickrst_test.cc
+++ b/test/pickrst_test.cc
@@ -27,177 +27,6 @@
 
 #define MAX_DATA_BLOCK 384
 
-namespace pickrst_test_lowbd {
-static const int kIterations = 100;
-
-typedef int64_t (*lowbd_pixel_proj_error_func)(
-    const uint8_t *src8, int width, int height, int src_stride,
-    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
-    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-typedef std::tuple<const lowbd_pixel_proj_error_func> PixelProjErrorTestParam;
-
-class PixelProjErrorTest
-    : public ::testing::TestWithParam<PixelProjErrorTestParam> {
- public:
-  virtual void SetUp() {
-    target_func_ = GET_PARAM(0);
-    src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                  sizeof(*src_)));
-    ASSERT_NE(src_, nullptr);
-    dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                  sizeof(*dgd_)));
-    ASSERT_NE(dgd_, nullptr);
-    flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                   sizeof(*flt0_)));
-    ASSERT_NE(flt0_, nullptr);
-    flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                   sizeof(*flt1_)));
-    ASSERT_NE(flt1_, nullptr);
-  }
-  virtual void TearDown() {
-    aom_free(src_);
-    aom_free(dgd_);
-    aom_free(flt0_);
-    aom_free(flt1_);
-  }
-  void RunPixelProjErrorTest(int32_t run_times);
-  void RunPixelProjErrorTest_ExtremeValues();
-
- private:
-  lowbd_pixel_proj_error_func target_func_;
-  libaom_test::ACMRandom rng_;
-  uint8_t *src_;
-  uint8_t *dgd_;
-  int32_t *flt0_;
-  int32_t *flt1_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjErrorTest);
-
-void PixelProjErrorTest::RunPixelProjErrorTest(int32_t run_times) {
-  int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
-  int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
-  const int dgd_stride = MAX_DATA_BLOCK;
-  const int src_stride = MAX_DATA_BLOCK;
-  const int flt0_stride = MAX_DATA_BLOCK;
-  const int flt1_stride = MAX_DATA_BLOCK;
-  sgr_params_type params;
-  int xq[2];
-  const int iters = run_times == 1 ? kIterations : 4;
-  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
-    int64_t err_ref = 0, err_test = 1;
-    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_[i] = rng_.Rand8();
-      src_[i] = rng_.Rand8();
-      flt0_[i] = rng_.Rand15Signed();
-      flt1_[i] = rng_.Rand15Signed();
-    }
-    xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
-    xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
-    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
-    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
-    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
-    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
-    uint8_t *dgd = dgd_;
-    uint8_t *src = src_;
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      err_ref = av1_lowbd_pixel_proj_error_c(src, h_end, v_end, src_stride, dgd,
-                                             dgd_stride, flt0_, flt0_stride,
-                                             flt1_, flt1_stride, xq, &params);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      err_test =
-          target_func_(src, h_end, v_end, src_stride, dgd, dgd_stride, flt0_,
-                       flt0_stride, flt1_, flt1_stride, xq, &params);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 10) {
-      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
-             params.r[1], h_end, v_end, time1, time2, time1 / time2);
-    }
-    ASSERT_EQ(err_ref, err_test);
-  }
-}
-
-void PixelProjErrorTest::RunPixelProjErrorTest_ExtremeValues() {
-  const int h_start = 0;
-  int h_end = 192;
-  const int v_start = 0;
-  int v_end = 192;
-  const int dgd_stride = MAX_DATA_BLOCK;
-  const int src_stride = MAX_DATA_BLOCK;
-  const int flt0_stride = MAX_DATA_BLOCK;
-  const int flt1_stride = MAX_DATA_BLOCK;
-  sgr_params_type params;
-  int xq[2];
-  const int iters = kIterations;
-  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
-    int64_t err_ref = 0, err_test = 1;
-    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_[i] = 0;
-      src_[i] = 255;
-      flt0_[i] = rng_.Rand15Signed();
-      flt1_[i] = rng_.Rand15Signed();
-    }
-    xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
-    xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
-    params.r[0] = rng_.Rand8() % MAX_RADIUS;
-    params.r[1] = rng_.Rand8() % MAX_RADIUS;
-    params.s[0] = rng_.Rand8() % MAX_RADIUS;
-    params.s[1] = rng_.Rand8() % MAX_RADIUS;
-    uint8_t *dgd = dgd_;
-    uint8_t *src = src_;
-
-    err_ref = av1_lowbd_pixel_proj_error_c(
-        src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
-        flt0_, flt0_stride, flt1_, flt1_stride, xq, &params);
-
-    err_test = target_func_(src, h_end - h_start, v_end - v_start, src_stride,
-                            dgd, dgd_stride, flt0_, flt0_stride, flt1_,
-                            flt1_stride, xq, &params);
-
-    ASSERT_EQ(err_ref, err_test);
-  }
-}
-
-TEST_P(PixelProjErrorTest, RandomValues) { RunPixelProjErrorTest(1); }
-
-TEST_P(PixelProjErrorTest, ExtremeValues) {
-  RunPixelProjErrorTest_ExtremeValues();
-}
-
-TEST_P(PixelProjErrorTest, DISABLED_Speed) { RunPixelProjErrorTest(200000); }
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjErrorTest,
-                         ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-
-INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjErrorTest,
-                         ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
-#endif  // HAVE_AVX2
-
-#if HAVE_NEON
-
-INSTANTIATE_TEST_SUITE_P(NEON, PixelProjErrorTest,
-                         ::testing::Values(av1_lowbd_pixel_proj_error_neon));
-#endif  // HAVE_NEON
-
-}  // namespace pickrst_test_lowbd
-
 namespace pickrst_test_highbd {
 static const int kIterations = 100;
 
@@ -364,179 +193,3 @@
 #endif  // HAVE_AVX2
 
 }  // namespace pickrst_test_highbd
-
-////////////////////////////////////////////////////////////////////////////////
-// Get_proj_subspace_Test
-////////////////////////////////////////////////////////////////////////////////
-
-namespace get_proj_subspace_test_lowbd {
-static const int kIterations = 100;
-
-typedef void (*set_get_proj_subspace)(const uint8_t *src8, int width,
-                                      int height, int src_stride,
-                                      const uint8_t *dat8, int dat_stride,
-                                      int32_t *flt0, int flt0_stride,
-                                      int32_t *flt1, int flt1_stride,
-                                      int64_t H[2][2], int64_t C[2],
-                                      const sgr_params_type *params);
-
-typedef std::tuple<const set_get_proj_subspace> GetProjSubspaceTestParam;
-
-class GetProjSubspaceTest
-    : public ::testing::TestWithParam<GetProjSubspaceTestParam> {
- public:
-  virtual void SetUp() {
-    target_func_ = GET_PARAM(0);
-    src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                  sizeof(*src_)));
-    ASSERT_NE(src_, nullptr);
-    dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                  sizeof(*dgd_)));
-    ASSERT_NE(dgd_, nullptr);
-    flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                   sizeof(*flt0_)));
-    ASSERT_NE(flt0_, nullptr);
-    flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
-                                   sizeof(*flt1_)));
-    ASSERT_NE(flt1_, nullptr);
-  }
-  virtual void TearDown() {
-    aom_free(src_);
-    aom_free(dgd_);
-    aom_free(flt0_);
-    aom_free(flt1_);
-  }
-  void RunGetProjSubspaceTest(int32_t run_times);
-  void RunGetProjSubspaceTest_ExtremeValues();
-
- private:
-  set_get_proj_subspace target_func_;
-  libaom_test::ACMRandom rng_;
-  uint8_t *src_;
-  uint8_t *dgd_;
-  int32_t *flt0_;
-  int32_t *flt1_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTest);
-
-void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
-  int h_end = run_times != 1
-                  ? 128
-                  : ((rng_.Rand16() % MAX_DATA_BLOCK) &
-                     2147483640);  // We test for widths divisible by 8.
-  int v_end =
-      run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640);
-  const int dgd_stride = MAX_DATA_BLOCK;
-  const int src_stride = MAX_DATA_BLOCK;
-  const int flt0_stride = MAX_DATA_BLOCK;
-  const int flt1_stride = MAX_DATA_BLOCK;
-  sgr_params_type params;
-  const int iters = run_times == 1 ? kIterations : 4;
-  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
-    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
-    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
-    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
-    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_[i] = rng_.Rand8();
-      src_[i] = rng_.Rand8();
-      flt0_[i] = rng_.Rand15Signed();
-      flt1_[i] = rng_.Rand15Signed();
-    }
-
-    params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
-    params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1;
-    params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
-    params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
-    uint8_t *dgd = dgd_;
-    uint8_t *src = src_;
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      av1_calc_proj_params_c(src, v_end, h_end, src_stride, dgd, dgd_stride,
-                             flt0_, flt0_stride, flt1_, flt1_stride, H_ref,
-                             C_ref, &params);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_,
-                   flt0_stride, flt1_, flt1_stride, H_test, C_test, &params);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 10) {
-      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
-             params.r[1], h_end, v_end, time1, time2, time1 / time2);
-    } else {
-      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
-      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
-      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
-      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
-      ASSERT_EQ(C_ref[0], C_test[0]);
-      ASSERT_EQ(C_ref[1], C_test[1]);
-    }
-  }
-}
-
-void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() {
-  const int h_start = 0;
-  int h_end = MAX_DATA_BLOCK;
-  const int v_start = 0;
-  int v_end = MAX_DATA_BLOCK;
-  const int dgd_stride = MAX_DATA_BLOCK;
-  const int src_stride = MAX_DATA_BLOCK;
-  const int flt0_stride = MAX_DATA_BLOCK;
-  const int flt1_stride = MAX_DATA_BLOCK;
-  sgr_params_type params;
-  const int iters = kIterations;
-  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
-    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
-    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
-    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
-    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_[i] = 0;
-      src_[i] = 255;
-      flt0_[i] = rng_.Rand15Signed();
-      flt1_[i] = rng_.Rand15Signed();
-    }
-    params.r[0] = 1;
-    params.r[1] = 1;
-    params.s[0] = rng_.Rand8() % MAX_RADIUS;
-    params.s[1] = rng_.Rand8() % MAX_RADIUS;
-    uint8_t *dgd = dgd_;
-    uint8_t *src = src_;
-
-    av1_calc_proj_params_c(src, h_end - h_start, v_end - v_start, src_stride,
-                           dgd, dgd_stride, flt0_, flt0_stride, flt1_,
-                           flt1_stride, H_ref, C_ref, &params);
-
-    target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd,
-                 dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test,
-                 C_test, &params);
-
-    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
-    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
-    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
-    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
-    ASSERT_EQ(C_ref[0], C_test[0]);
-    ASSERT_EQ(C_ref[1], C_test[1]);
-  }
-}
-
-TEST_P(GetProjSubspaceTest, RandomValues) { RunGetProjSubspaceTest(1); }
-
-TEST_P(GetProjSubspaceTest, ExtremeValues) {
-  RunGetProjSubspaceTest_ExtremeValues();
-}
-
-TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); }
-
-#if HAVE_AVX2
-
-INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
-                         ::testing::Values(av1_calc_proj_params_avx2));
-#endif  // HAVE_AVX2
-
-}  // namespace get_proj_subspace_test_lowbd
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 17000cb..e8944ce 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -392,14 +392,4 @@
                          ::testing::ValuesIn(kQParamArraySSE2));
 #endif  // HAVE_SSE2
 
-#if HAVE_NEON
-const QuantizeParam kQParamArrayNEON[] = {
-  make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_c,
-             static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
-};
-
-INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest,
-                         ::testing::ValuesIn(kQParamArrayNEON));
-#endif  // HAVE_NEON
-
 }  // namespace
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index 59c0a9c..3967a4d 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -30,36 +30,6 @@
 namespace {
 using libaom_test::ACMRandom;
 
-typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask,
-                                           DIFFWTD_MASK_TYPE mask_type,
-                                           const uint8_t *src0, int src0_stride,
-                                           const uint8_t *src1, int src1_stride,
-                                           int h, int w);
-
-typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
-    BuildCompDiffwtdMaskDParam;
-
-#if HAVE_SSE4_1
-::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams(
-    buildcompdiffwtdmaskd_func filter) {
-  return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
-                            ::testing::Values(filter));
-}
-#endif
-
-class BuildCompDiffwtdMaskTest
-    : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> {
- public:
-  virtual ~BuildCompDiffwtdMaskTest() {}
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-  void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
-               const DIFFWTD_MASK_TYPE type);
-
- private:
-  ACMRandom rnd_;
-};
-
 typedef void (*buildcompdiffwtdmaskd16_func)(
     uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
@@ -176,57 +146,6 @@
   printf("av1_build_compound_diffwtd_mask_d16  %3dx%-3d: %7.2f \n", width,
          height, elapsed_time / double(elapsed_time1));
 }
-#if HAVE_SSE4_1
-void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl,
-                                       const int is_speed,
-                                       const DIFFWTD_MASK_TYPE type) {
-  const int sb_type = GET_PARAM(0);
-  const int width = block_size_wide[sb_type];
-  const int height = block_size_high[sb_type];
-  DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  for (int i = 0; i < width * height; i++) {
-    src0[i] = rnd.Rand8();
-    src1[i] = rnd.Rand8();
-  }
-  const int run_times = is_speed ? (10000000 / (width + height)) : 1;
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < run_times; ++i) {
-    av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width,
-                                      height, width);
-  }
-  const double t1 = get_time_mark(&timer);
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < run_times; ++i) {
-    test_impl(mask_test, type, src0, width, src1, width, height, width);
-  }
-  const double t2 = get_time_mark(&timer);
-  if (is_speed) {
-    printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
-    printf("(%3.2f)\n", t1 / t2);
-  }
-  for (int r = 0; r < height; ++r) {
-    for (int c = 0; c < width; ++c) {
-      ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
-          << "[" << r << "," << c << "] " << run_times << " @ " << width << "x"
-          << height << " inv " << type;
-    }
-  }
-}
-
-TEST_P(BuildCompDiffwtdMaskTest, match) {
-  RunTest(GET_PARAM(1), 0, DIFFWTD_38);
-  RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV);
-}
-TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
-  RunTest(GET_PARAM(1), 1, DIFFWTD_38);
-  RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV);
-}
-#endif
 TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
   RunCheckOutput(GET_PARAM(1));
 }
@@ -237,18 +156,12 @@
 }
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
-                         BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
-
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, BuildCompDiffwtdMaskD16Test,
     BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
 #endif
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
-                         BuildParams(av1_build_compound_diffwtd_mask_avx2));
-
 INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
                          BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
 #endif
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 627b7ad..29dbb72 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -459,6 +459,7 @@
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SADx4AvgTest);
 
 class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
                 public SADTestBase {
@@ -579,6 +580,7 @@
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest);
 
 class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>,
                        public SADTestBase {
@@ -644,6 +646,7 @@
     }
   }
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADavgTest);
 
 uint8_t *SADTestBase::source_data_ = NULL;
 uint8_t *SADTestBase::reference_data_ = NULL;
@@ -1173,22 +1176,6 @@
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128_c, -1),
-  make_tuple(128, 64, &aom_sad128x64_c, -1),
-  make_tuple(64, 128, &aom_sad64x128_c, -1),
-  make_tuple(64, 64, &aom_sad64x64_c, -1),
-  make_tuple(64, 32, &aom_sad64x32_c, -1),
-  make_tuple(32, 64, &aom_sad32x64_c, -1),
-  make_tuple(32, 32, &aom_sad32x32_c, -1),
-  make_tuple(32, 16, &aom_sad32x16_c, -1),
-  make_tuple(16, 32, &aom_sad16x32_c, -1),
-  make_tuple(16, 16, &aom_sad16x16_c, -1),
-  make_tuple(16, 8, &aom_sad16x8_c, -1),
-  make_tuple(8, 16, &aom_sad8x16_c, -1),
-  make_tuple(8, 8, &aom_sad8x8_c, -1),
-  make_tuple(8, 4, &aom_sad8x4_c, -1),
-  make_tuple(4, 8, &aom_sad4x8_c, -1),
-  make_tuple(4, 4, &aom_sad4x4_c, -1),
   make_tuple(128, 128, &aom_highbd_sad128x128_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_c, 8),
@@ -1237,8 +1224,6 @@
   make_tuple(8, 4, &aom_highbd_sad8x4_c, 12),
   make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
-  make_tuple(64, 16, &aom_sad64x16_c, -1),
-  make_tuple(16, 64, &aom_sad16x64_c, -1),
 
   make_tuple(64, 16, &aom_highbd_sad64x16_c, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64_c, 8),
@@ -1247,9 +1232,6 @@
   make_tuple(64, 16, &aom_highbd_sad64x16_c, 12),
   make_tuple(16, 64, &aom_highbd_sad16x64_c, 12),
 
-  make_tuple(32, 8, &aom_sad32x8_c, -1),
-  make_tuple(8, 32, &aom_sad8x32_c, -1),
-
   make_tuple(32, 8, &aom_highbd_sad32x8_c, 8),
   make_tuple(8, 32, &aom_highbd_sad8x32_c, 8),
   make_tuple(32, 8, &aom_highbd_sad32x8_c, 10),
@@ -1257,9 +1239,6 @@
   make_tuple(32, 8, &aom_highbd_sad32x8_c, 12),
   make_tuple(8, 32, &aom_highbd_sad8x32_c, 12),
 
-  make_tuple(16, 4, &aom_sad16x4_c, -1),
-  make_tuple(4, 16, &aom_sad4x16_c, -1),
-
   make_tuple(16, 4, &aom_highbd_sad16x4_c, 8),
   make_tuple(4, 16, &aom_highbd_sad4x16_c, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4_c, 10),
@@ -1270,29 +1249,6 @@
 INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 const SadSkipMxNParam skip_c_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128_c, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64_c, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128_c, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64_c, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32_c, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64_c, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32_c, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16_c, -1),
-  make_tuple(16, 32, &aom_sad_skip_16x32_c, -1),
-  make_tuple(16, 16, &aom_sad_skip_16x16_c, -1),
-  make_tuple(16, 8, &aom_sad_skip_16x8_c, -1),
-  make_tuple(8, 16, &aom_sad_skip_8x16_c, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8_c, -1),
-  make_tuple(8, 4, &aom_sad_skip_8x4_c, -1),
-  make_tuple(4, 8, &aom_sad_skip_4x8_c, -1),
-  make_tuple(4, 4, &aom_sad_skip_4x4_c, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16_c, -1),
-  make_tuple(16, 64, &aom_sad_skip_16x64_c, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8_c, -1),
-  make_tuple(8, 32, &aom_sad_skip_8x32_c, -1),
-  make_tuple(16, 4, &aom_sad_skip_16x4_c, -1),
-  make_tuple(4, 16, &aom_sad_skip_4x16_c, -1),
-
   make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 8),
   make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 8),
   make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 8),
@@ -1365,23 +1321,6 @@
 INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
-  make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
-  make_tuple(64, 128, &aom_sad64x128_avg_c, -1),
-  make_tuple(64, 64, &aom_sad64x64_avg_c, -1),
-  make_tuple(64, 32, &aom_sad64x32_avg_c, -1),
-  make_tuple(32, 64, &aom_sad32x64_avg_c, -1),
-  make_tuple(32, 32, &aom_sad32x32_avg_c, -1),
-  make_tuple(32, 16, &aom_sad32x16_avg_c, -1),
-  make_tuple(16, 32, &aom_sad16x32_avg_c, -1),
-  make_tuple(16, 16, &aom_sad16x16_avg_c, -1),
-  make_tuple(16, 8, &aom_sad16x8_avg_c, -1),
-  make_tuple(8, 16, &aom_sad8x16_avg_c, -1),
-  make_tuple(8, 8, &aom_sad8x8_avg_c, -1),
-  make_tuple(8, 4, &aom_sad8x4_avg_c, -1),
-  make_tuple(4, 8, &aom_sad4x8_avg_c, -1),
-  make_tuple(4, 4, &aom_sad4x4_avg_c, -1),
-
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8),
@@ -1431,9 +1370,6 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
 
-  make_tuple(64, 16, &aom_sad64x16_avg_c, -1),
-  make_tuple(16, 64, &aom_sad16x64_avg_c, -1),
-
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 10),
@@ -1441,9 +1377,6 @@
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 12),
   make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 12),
 
-  make_tuple(32, 8, &aom_sad32x8_avg_c, -1),
-  make_tuple(8, 32, &aom_sad8x32_avg_c, -1),
-
   make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 8),
   make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 8),
   make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 10),
@@ -1451,9 +1384,6 @@
   make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 12),
   make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 12),
 
-  make_tuple(16, 4, &aom_sad16x4_avg_c, -1),
-  make_tuple(4, 16, &aom_sad4x16_avg_c, -1),
-
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 8),
   make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 10),
@@ -1463,83 +1393,7 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
-// TODO(chengchen): add highbd tests
-const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
-  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-
-  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
-  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
-};
-
-INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
-                         ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
-
-const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
-  make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
-  make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1),
-  make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1),
-  make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1),
-  make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1),
-  make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1),
-  make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1),
-  make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1),
-  make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1),
-  make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1),
-  make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_c, -1),
-  make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1),
-  make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1),
-  make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1),
-  make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
-  make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
-
-  make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1),
-  make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1),
-  make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1),
-  make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1),
-  make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1),
-  make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1),
-};
-
-INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest,
-                         ::testing::ValuesIn(dist_wtd_avg_c_tests));
-
 const SadMxNx4Param x4d_c_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
-  make_tuple(128, 64, &aom_sad128x64x4d_c, -1),
-  make_tuple(64, 128, &aom_sad64x128x4d_c, -1),
-  make_tuple(64, 64, &aom_sad64x64x4d_c, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_c, -1),
-  make_tuple(32, 64, &aom_sad32x64x4d_c, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_c, -1),
-  make_tuple(32, 16, &aom_sad32x16x4d_c, -1),
-  make_tuple(16, 32, &aom_sad16x32x4d_c, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_c, -1),
-  make_tuple(16, 8, &aom_sad16x8x4d_c, -1),
-  make_tuple(8, 16, &aom_sad8x16x4d_c, -1),
-  make_tuple(8, 8, &aom_sad8x8x4d_c, -1),
-  make_tuple(8, 4, &aom_sad8x4x4d_c, -1),
-  make_tuple(4, 8, &aom_sad4x8x4d_c, -1),
-  make_tuple(4, 4, &aom_sad4x4x4d_c, -1),
-
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8),
   make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8),
   make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8),
@@ -1589,9 +1443,6 @@
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
 
-  make_tuple(64, 16, &aom_sad64x16x4d_c, -1),
-  make_tuple(16, 64, &aom_sad16x64x4d_c, -1),
-
   make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 10),
@@ -1599,9 +1450,6 @@
   make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 12),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 12),
 
-  make_tuple(32, 8, &aom_sad32x8x4d_c, -1),
-  make_tuple(8, 32, &aom_sad8x32x4d_c, -1),
-
   make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 8),
   make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 8),
   make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 10),
@@ -1609,9 +1457,6 @@
   make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 12),
   make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 12),
 
-  make_tuple(16, 4, &aom_sad16x4x4d_c, -1),
-  make_tuple(4, 16, &aom_sad4x16x4d_c, -1),
-
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 8),
   make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 10),
@@ -1622,26 +1467,6 @@
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
 const SadMxNx4Param skip_x4d_c_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128x4d_c, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64x4d_c, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32x4d_c, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64x4d_c, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32x4d_c, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16x4d_c, -1),
-  make_tuple(16, 32, &aom_sad_skip_16x32x4d_c, -1),
-  make_tuple(16, 16, &aom_sad_skip_16x16x4d_c, -1),
-  make_tuple(16, 8, &aom_sad_skip_16x8x4d_c, -1),
-  make_tuple(8, 16, &aom_sad_skip_8x16x4d_c, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8x4d_c, -1),
-  make_tuple(4, 8, &aom_sad_skip_4x8x4d_c, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16x4d_c, -1),
-  make_tuple(16, 64, &aom_sad_skip_16x64x4d_c, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8x4d_c, -1),
-  make_tuple(8, 32, &aom_sad_skip_8x32x4d_c, -1),
-  make_tuple(4, 16, &aom_sad_skip_4x16x4d_c, -1),
-
   make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 8),
   make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 8),
   make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 8),
@@ -1713,124 +1538,10 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
                          ::testing::ValuesIn(skip_x4d_c_tests));
-
-const SadMxNx4AvgParam x4d_avg_c_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1),
-  make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1),
-  make_tuple(64, 128, &aom_sad64x128x4d_avg_c, -1),
-  make_tuple(64, 64, &aom_sad64x64x4d_avg_c, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_avg_c, -1),
-  make_tuple(32, 64, &aom_sad32x64x4d_avg_c, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_avg_c, -1),
-  make_tuple(32, 16, &aom_sad32x16x4d_avg_c, -1),
-  make_tuple(16, 32, &aom_sad16x32x4d_avg_c, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_avg_c, -1),
-  make_tuple(16, 8, &aom_sad16x8x4d_avg_c, -1),
-  make_tuple(8, 16, &aom_sad8x16x4d_avg_c, -1),
-  make_tuple(8, 8, &aom_sad8x8x4d_avg_c, -1),
-  make_tuple(8, 4, &aom_sad8x4x4d_avg_c, -1),
-  make_tuple(4, 8, &aom_sad4x8x4d_avg_c, -1),
-  make_tuple(4, 4, &aom_sad4x4x4d_avg_c, -1),
-  make_tuple(64, 16, &aom_sad64x16x4d_avg_c, -1),
-  make_tuple(16, 64, &aom_sad16x64x4d_avg_c, -1),
-  make_tuple(32, 8, &aom_sad32x8x4d_avg_c, -1),
-  make_tuple(8, 32, &aom_sad8x32x4d_avg_c, -1),
-  make_tuple(16, 4, &aom_sad16x4x4d_avg_c, -1),
-  make_tuple(4, 16, &aom_sad4x16x4d_avg_c, -1),
-};
-INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests));
-
-//------------------------------------------------------------------------------
-// ARM functions
-#if HAVE_NEON
-const SadMxNParam neon_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128_neon, -1),
-  make_tuple(64, 64, &aom_sad64x64_neon, -1),
-  make_tuple(32, 32, &aom_sad32x32_neon, -1),
-  make_tuple(16, 16, &aom_sad16x16_neon, -1),
-  make_tuple(16, 8, &aom_sad16x8_neon, -1),
-  make_tuple(8, 16, &aom_sad8x16_neon, -1),
-  make_tuple(8, 8, &aom_sad8x8_neon, -1),
-  make_tuple(4, 4, &aom_sad4x4_neon, -1),
-};
-INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
-
-const SadMxNx4Param x4d_neon_tests[] = {
-  make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
-};
-INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
-const SadSkipMxNParam skip_neon_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128_neon, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64_neon, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128_neon, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64_neon, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32_neon, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16_neon, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64_neon, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32_neon, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16_neon, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8_neon, -1),
-  make_tuple(16, 64, &aom_sad_skip_16x64_neon, -1),
-  make_tuple(16, 32, &aom_sad_skip_16x32_neon, -1),
-  make_tuple(16, 16, &aom_sad_skip_16x16_neon, -1),
-  make_tuple(16, 8, &aom_sad_skip_16x8_neon, -1),
-  make_tuple(8, 32, &aom_sad_skip_8x32_neon, -1),
-  make_tuple(8, 16, &aom_sad_skip_8x16_neon, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8_neon, -1),
-  make_tuple(4, 16, &aom_sad_skip_4x16_neon, -1),
-  make_tuple(4, 8, &aom_sad_skip_4x8_neon, -1),
-};
-INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest,
-                         ::testing::ValuesIn(skip_neon_tests));
-
-const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon, -1),
-  make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon, -1),
-  make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon, -1),
-  make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon, -1),
-  make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
-  make_tuple(8, 16, &aom_sad_skip_8x16x4d_neon, -1),
-  make_tuple(8, 32, &aom_sad_skip_8x32x4d_neon, -1),
-  make_tuple(4, 8, &aom_sad_skip_4x8x4d_neon, -1),
-  make_tuple(4, 16, &aom_sad_skip_4x16x4d_neon, -1),
-};
-INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
-                         ::testing::ValuesIn(skip_x4d_neon_tests));
-#endif  // HAVE_NEON
-
 //------------------------------------------------------------------------------
 // x86 functions
 #if HAVE_SSE2
 const SadMxNParam sse2_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128_sse2, -1),
-  make_tuple(128, 64, &aom_sad128x64_sse2, -1),
-  make_tuple(64, 128, &aom_sad64x128_sse2, -1),
-  make_tuple(64, 64, &aom_sad64x64_sse2, -1),
-  make_tuple(64, 32, &aom_sad64x32_sse2, -1),
-  make_tuple(32, 64, &aom_sad32x64_sse2, -1),
-  make_tuple(32, 32, &aom_sad32x32_sse2, -1),
-  make_tuple(32, 16, &aom_sad32x16_sse2, -1),
-  make_tuple(16, 32, &aom_sad16x32_sse2, -1),
-  make_tuple(16, 16, &aom_sad16x16_sse2, -1),
-  make_tuple(16, 8, &aom_sad16x8_sse2, -1),
-  make_tuple(8, 16, &aom_sad8x16_sse2, -1),
-  make_tuple(8, 8, &aom_sad8x8_sse2, -1),
-  make_tuple(8, 4, &aom_sad8x4_sse2, -1),
-  make_tuple(4, 8, &aom_sad4x8_sse2, -1),
-  make_tuple(4, 4, &aom_sad4x4_sse2, -1),
-
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8),
@@ -1871,9 +1582,6 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
 
-  make_tuple(64, 16, &aom_sad64x16_sse2, -1),
-  make_tuple(16, 64, &aom_sad16x64_sse2, -1),
-
   make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 10),
@@ -1881,9 +1589,6 @@
   make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 12),
   make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 12),
 
-  make_tuple(32, 8, &aom_sad32x8_sse2, -1),
-  make_tuple(8, 32, &aom_sad8x32_sse2, -1),
-
   make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 8),
   make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 8),
   make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 10),
@@ -1891,9 +1596,6 @@
   make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 12),
   make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12),
 
-  make_tuple(16, 4, &aom_sad16x4_sse2, -1),
-  make_tuple(4, 16, &aom_sad4x16_sse2, -1),
-
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8),
   make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10),
@@ -1904,26 +1606,6 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
 const SadSkipMxNParam skip_sse2_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128_sse2, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64_sse2, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128_sse2, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64_sse2, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32_sse2, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64_sse2, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32_sse2, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16_sse2, -1),
-  make_tuple(16, 32, &aom_sad_skip_16x32_sse2, -1),
-  make_tuple(16, 16, &aom_sad_skip_16x16_sse2, -1),
-  make_tuple(16, 8, &aom_sad_skip_16x8_sse2, -1),
-  make_tuple(8, 16, &aom_sad_skip_8x16_sse2, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8_sse2, -1),
-  make_tuple(4, 8, &aom_sad_skip_4x8_sse2, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16_sse2, -1),
-  make_tuple(16, 64, &aom_sad_skip_16x64_sse2, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8_sse2, -1),
-  make_tuple(8, 32, &aom_sad_skip_8x32_sse2, -1),
-  make_tuple(4, 16, &aom_sad_skip_4x16_sse2, -1),
-
   make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 8),
@@ -1979,23 +1661,6 @@
                          ::testing::ValuesIn(skip_sse2_tests));
 
 const SadMxNAvgParam avg_sse2_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
-  make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
-  make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1),
-  make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1),
-  make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1),
-  make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1),
-  make_tuple(32, 32, &aom_sad32x32_avg_sse2, -1),
-  make_tuple(32, 16, &aom_sad32x16_avg_sse2, -1),
-  make_tuple(16, 32, &aom_sad16x32_avg_sse2, -1),
-  make_tuple(16, 16, &aom_sad16x16_avg_sse2, -1),
-  make_tuple(16, 8, &aom_sad16x8_avg_sse2, -1),
-  make_tuple(8, 16, &aom_sad8x16_avg_sse2, -1),
-  make_tuple(8, 8, &aom_sad8x8_avg_sse2, -1),
-  make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1),
-  make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1),
-  make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1),
-
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8),
@@ -2036,9 +1701,6 @@
   make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
 
-  make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
-  make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
-
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 10),
@@ -2046,9 +1708,6 @@
   make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 12),
   make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 12),
 
-  make_tuple(32, 8, &aom_sad32x8_avg_sse2, -1),
-  make_tuple(8, 32, &aom_sad8x32_avg_sse2, -1),
-
   make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 8),
   make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 8),
   make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 10),
@@ -2056,9 +1715,6 @@
   make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 12),
   make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12),
 
-  make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1),
-  make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
-
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8),
   make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10),
@@ -2069,23 +1725,6 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
 const SadMxNx4Param x4d_sse2_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1),
-  make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1),
-  make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1),
-  make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1),
-  make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_sse2, -1),
-  make_tuple(32, 16, &aom_sad32x16x4d_sse2, -1),
-  make_tuple(16, 32, &aom_sad16x32x4d_sse2, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1),
-  make_tuple(16, 8, &aom_sad16x8x4d_sse2, -1),
-  make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1),
-  make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1),
-  make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
-  make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1),
-  make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1),
-
   make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8),
@@ -2126,9 +1765,6 @@
   make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
   make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
 
-  make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1),
-  make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1),
-
   make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 8),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 8),
   make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 10),
@@ -2136,9 +1772,6 @@
   make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 12),
   make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 12),
 
-  make_tuple(32, 8, &aom_sad32x8x4d_sse2, -1),
-  make_tuple(8, 32, &aom_sad8x32x4d_sse2, -1),
-
   make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 8),
   make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 8),
   make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 10),
@@ -2146,9 +1779,6 @@
   make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 12),
   make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 12),
 
-  make_tuple(16, 4, &aom_sad16x4x4d_sse2, -1),
-  make_tuple(4, 16, &aom_sad4x16x4d_sse2, -1),
-
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 8),
   make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 10),
@@ -2159,26 +1789,6 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 
 const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128x4d_sse2, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64x4d_sse2, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128x4d_sse2, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64x4d_sse2, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32x4d_sse2, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64x4d_sse2, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32x4d_sse2, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16x4d_sse2, -1),
-  make_tuple(16, 32, &aom_sad_skip_16x32x4d_sse2, -1),
-  make_tuple(16, 16, &aom_sad_skip_16x16x4d_sse2, -1),
-  make_tuple(16, 8, &aom_sad_skip_16x8x4d_sse2, -1),
-  make_tuple(8, 16, &aom_sad_skip_8x16x4d_sse2, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8x4d_sse2, -1),
-  make_tuple(4, 8, &aom_sad_skip_4x8x4d_sse2, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16x4d_sse2, -1),
-  make_tuple(16, 64, &aom_sad_skip_16x64x4d_sse2, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8x4d_sse2, -1),
-  make_tuple(8, 32, &aom_sad_skip_8x32x4d_sse2, -1),
-  make_tuple(4, 16, &aom_sad_skip_4x16x4d_sse2, -1),
-
   make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 8),
   make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 8),
   make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 8),
@@ -2232,152 +1842,10 @@
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
                          ::testing::ValuesIn(skip_x4d_sse2_tests));
-
-const SadMxNx4AvgParam x4d_avg_sse2_tests[] = {
-  make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1),
-  make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1),
-  make_tuple(64, 128, &aom_sad64x128x4d_avg_sse2, -1),
-  make_tuple(64, 64, &aom_sad64x64x4d_avg_sse2, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_avg_sse2, -1),
-  make_tuple(32, 64, &aom_sad32x64x4d_avg_sse2, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_avg_sse2, -1),
-  make_tuple(32, 16, &aom_sad32x16x4d_avg_sse2, -1),
-  make_tuple(16, 32, &aom_sad16x32x4d_avg_sse2, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_avg_sse2, -1),
-  make_tuple(16, 8, &aom_sad16x8x4d_avg_sse2, -1),
-  make_tuple(8, 16, &aom_sad8x16x4d_avg_sse2, -1),
-  make_tuple(8, 8, &aom_sad8x8x4d_avg_sse2, -1),
-  make_tuple(8, 4, &aom_sad8x4x4d_avg_sse2, -1),
-  make_tuple(4, 8, &aom_sad4x8x4d_avg_sse2, -1),
-  make_tuple(4, 4, &aom_sad4x4x4d_avg_sse2, -1),
-  make_tuple(64, 16, &aom_sad64x16x4d_avg_sse2, -1),
-  make_tuple(16, 64, &aom_sad16x64x4d_avg_sse2, -1),
-  make_tuple(32, 8, &aom_sad32x8x4d_avg_sse2, -1),
-  make_tuple(8, 32, &aom_sad8x32x4d_avg_sse2, -1),
-  make_tuple(16, 4, &aom_sad16x4x4d_avg_sse2, -1),
-  make_tuple(4, 16, &aom_sad4x16x4d_avg_sse2, -1),
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, SADx4AvgTest,
-                         ::testing::ValuesIn(x4d_avg_sse2_tests));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSSE3
-// Note: These are named sse2, but part of ssse3 file and only built and linked
-// when ssse3 is enabled.
-const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = {
-  make_tuple(4, 4, &aom_sad4xh_sse2, -1),
-  make_tuple(4, 8, &aom_sad4xh_sse2, -1),
-  make_tuple(8, 4, &aom_sad8xh_sse2, -1),
-  make_tuple(8, 8, &aom_sad8xh_sse2, -1),
-  make_tuple(8, 16, &aom_sad8xh_sse2, -1),
-  make_tuple(16, 8, &aom_sad16xh_sse2, -1),
-  make_tuple(16, 16, &aom_sad16xh_sse2, -1),
-  make_tuple(16, 32, &aom_sad16xh_sse2, -1),
-  make_tuple(32, 16, &aom_sad32xh_sse2, -1),
-  make_tuple(32, 32, &aom_sad32xh_sse2, -1),
-  make_tuple(32, 64, &aom_sad32xh_sse2, -1),
-  make_tuple(64, 32, &aom_sad64xh_sse2, -1),
-  make_tuple(64, 64, &aom_sad64xh_sse2, -1),
-  make_tuple(128, 128, &aom_sad128xh_sse2, -1),
-  make_tuple(128, 64, &aom_sad128xh_sse2, -1),
-  make_tuple(64, 128, &aom_sad64xh_sse2, -1),
-  make_tuple(4, 16, &aom_sad4xh_sse2, -1),
-  make_tuple(16, 4, &aom_sad16xh_sse2, -1),
-  make_tuple(8, 32, &aom_sad8xh_sse2, -1),
-  make_tuple(32, 8, &aom_sad32xh_sse2, -1),
-  make_tuple(16, 64, &aom_sad16xh_sse2, -1),
-  make_tuple(64, 16, &aom_sad64xh_sse2, -1),
-
-  make_tuple(16, 64, &aom_sad16xh_sse2, -1),
-  make_tuple(64, 16, &aom_sad64xh_sse2, -1),
-  make_tuple(8, 32, &aom_sad8xh_sse2, -1),
-  make_tuple(32, 8, &aom_sad32xh_sse2, -1),
-  make_tuple(4, 16, &aom_sad4xh_sse2, -1),
-  make_tuple(16, 4, &aom_sad16xh_sse2, -1),
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADTest,
-                         ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
-
-#endif  // HAVE_SSSE3
-
-#if HAVE_SSE3
-// Only functions are x3, which do not have tests.
-#endif  // HAVE_SSE3
-
-#if HAVE_SSSE3
-const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
-  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-
-  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
-                         ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
-
-const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = {
-  make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1),
-  make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_ssse3, -1),
-  make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_ssse3, -1),
-  make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_ssse3, -1),
-  make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_ssse3, -1),
-  make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_ssse3, -1),
-  make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_ssse3, -1),
-  make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_ssse3, -1),
-  make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_ssse3, -1),
-  make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_ssse3, -1),
-  make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_ssse3, -1),
-  make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_ssse3, -1),
-  make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_ssse3, -1),
-  make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1),
-  make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1),
-  make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1),
-
-  make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_ssse3, -1),
-  make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_ssse3, -1),
-  make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_ssse3, -1),
-  make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_ssse3, -1),
-  make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_ssse3, -1),
-  make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_ssse3, -1),
-};
-INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdSADavgTest,
-                         ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
-#endif  // HAVE_SSSE3
-
-#if HAVE_SSE4_1
-// Only functions are x8, which do not have tests.
-#endif  // HAVE_SSE4_1
-
 #if HAVE_AVX2
 const SadMxNParam avx2_tests[] = {
-  make_tuple(64, 128, &aom_sad64x128_avx2, -1),
-  make_tuple(128, 64, &aom_sad128x64_avx2, -1),
-  make_tuple(128, 128, &aom_sad128x128_avx2, -1),
-  make_tuple(64, 64, &aom_sad64x64_avx2, -1),
-  make_tuple(64, 32, &aom_sad64x32_avx2, -1),
-  make_tuple(32, 64, &aom_sad32x64_avx2, -1),
-  make_tuple(32, 32, &aom_sad32x32_avx2, -1),
-  make_tuple(32, 16, &aom_sad32x16_avx2, -1),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12),
@@ -2428,14 +1896,6 @@
 INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
 const SadSkipMxNParam skip_avx2_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128_avx2, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64_avx2, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128_avx2, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64_avx2, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32_avx2, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64_avx2, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32_avx2, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16_avx2, -1),
 
   make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 8),
   make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 8),
@@ -2480,14 +1940,6 @@
                          ::testing::ValuesIn(skip_avx2_tests));
 
 const SadMxNAvgParam avg_avx2_tests[] = {
-  make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
-  make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
-  make_tuple(128, 128, &aom_sad128x128_avg_avx2, -1),
-  make_tuple(64, 64, &aom_sad64x64_avg_avx2, -1),
-  make_tuple(64, 32, &aom_sad64x32_avg_avx2, -1),
-  make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1),
-  make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1),
-  make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12),
@@ -2538,17 +1990,6 @@
 INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
 const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
-  make_tuple(128, 128, &aom_sad_skip_128x128x4d_avx2, -1),
-  make_tuple(128, 64, &aom_sad_skip_128x64x4d_avx2, -1),
-  make_tuple(64, 128, &aom_sad_skip_64x128x4d_avx2, -1),
-  make_tuple(64, 64, &aom_sad_skip_64x64x4d_avx2, -1),
-  make_tuple(64, 32, &aom_sad_skip_64x32x4d_avx2, -1),
-  make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
-  make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
-  make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
-  make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
-  make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
-
   make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
   make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 8),
   make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 8),
@@ -2598,17 +2039,6 @@
                          ::testing::ValuesIn(skip_x4d_avx2_tests));
 
 const SadMxNx4Param x4d_avx2_tests[] = {
-  make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
-  make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
-  make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
-  make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
-  make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
-  make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
-  make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
-  make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
-
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
   make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12),
@@ -2659,59 +2089,4 @@
 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
 
-//------------------------------------------------------------------------------
-// MIPS functions
-#if HAVE_MSA
-const SadMxNParam msa_tests[] = {
-  make_tuple(64, 64, &aom_sad64x64_msa, -1),
-  make_tuple(64, 32, &aom_sad64x32_msa, -1),
-  make_tuple(32, 64, &aom_sad32x64_msa, -1),
-  make_tuple(32, 32, &aom_sad32x32_msa, -1),
-  make_tuple(32, 16, &aom_sad32x16_msa, -1),
-  make_tuple(16, 32, &aom_sad16x32_msa, -1),
-  make_tuple(16, 16, &aom_sad16x16_msa, -1),
-  make_tuple(16, 8, &aom_sad16x8_msa, -1),
-  make_tuple(8, 16, &aom_sad8x16_msa, -1),
-  make_tuple(8, 8, &aom_sad8x8_msa, -1),
-  make_tuple(8, 4, &aom_sad8x4_msa, -1),
-  make_tuple(4, 8, &aom_sad4x8_msa, -1),
-  make_tuple(4, 4, &aom_sad4x4_msa, -1),
-};
-INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
-
-const SadMxNAvgParam avg_msa_tests[] = {
-  make_tuple(64, 64, &aom_sad64x64_avg_msa, -1),
-  make_tuple(64, 32, &aom_sad64x32_avg_msa, -1),
-  make_tuple(32, 64, &aom_sad32x64_avg_msa, -1),
-  make_tuple(32, 32, &aom_sad32x32_avg_msa, -1),
-  make_tuple(32, 16, &aom_sad32x16_avg_msa, -1),
-  make_tuple(16, 32, &aom_sad16x32_avg_msa, -1),
-  make_tuple(16, 16, &aom_sad16x16_avg_msa, -1),
-  make_tuple(16, 8, &aom_sad16x8_avg_msa, -1),
-  make_tuple(8, 16, &aom_sad8x16_avg_msa, -1),
-  make_tuple(8, 8, &aom_sad8x8_avg_msa, -1),
-  make_tuple(8, 4, &aom_sad8x4_avg_msa, -1),
-  make_tuple(4, 8, &aom_sad4x8_avg_msa, -1),
-  make_tuple(4, 4, &aom_sad4x4_avg_msa, -1),
-};
-INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
-
-const SadMxNx4Param x4d_msa_tests[] = {
-  make_tuple(64, 64, &aom_sad64x64x4d_msa, -1),
-  make_tuple(64, 32, &aom_sad64x32x4d_msa, -1),
-  make_tuple(32, 64, &aom_sad32x64x4d_msa, -1),
-  make_tuple(32, 32, &aom_sad32x32x4d_msa, -1),
-  make_tuple(32, 16, &aom_sad32x16x4d_msa, -1),
-  make_tuple(16, 32, &aom_sad16x32x4d_msa, -1),
-  make_tuple(16, 16, &aom_sad16x16x4d_msa, -1),
-  make_tuple(16, 8, &aom_sad16x8x4d_msa, -1),
-  make_tuple(8, 16, &aom_sad8x16x4d_msa, -1),
-  make_tuple(8, 8, &aom_sad8x8x4d_msa, -1),
-  make_tuple(8, 4, &aom_sad8x4x4d_msa, -1),
-  make_tuple(4, 8, &aom_sad4x8x4d_msa, -1),
-  make_tuple(4, 4, &aom_sad4x4x4d_msa, -1),
-};
-INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
-#endif  // HAVE_MSA
-
 }  // namespace
diff --git a/test/sb_multipass_test.cc b/test/sb_multipass_test.cc
index 5c6d807..3080d7d 100644
--- a/test/sb_multipass_test.cc
+++ b/test/sb_multipass_test.cc
@@ -32,7 +32,6 @@
     aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
     cfg.w = 1280;
     cfg.h = 720;
-    cfg.allow_lowbitdepth = 1;
     decoder_ = codec_->CreateDecoder(cfg, 0);
     if (decoder_->IsAV1()) {
       decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index af47dab..2cc099a 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -34,192 +34,7 @@
 
 typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
                         int eps, const int *xqd, uint8_t *dst8, int dst_stride,
-                        int32_t *tmpbuf, int bit_depth, int highbd);
-
-// Test parameter list:
-//  <tst_fun_>
-typedef tuple<SgrFunc> FilterTestParam;
-
-class AV1SelfguidedFilterTest
-    : public ::testing::TestWithParam<FilterTestParam> {
- public:
-  virtual ~AV1SelfguidedFilterTest() {}
-  virtual void SetUp() {}
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunSpeedTest() {
-    tst_fun_ = GET_PARAM(0);
-    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
-    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
-    const int width = 256, height = 256, stride = 288, out_stride = 288;
-    const int NUM_ITERS = 2000;
-    int i, j, k;
-
-    uint8_t *input_ =
-        (uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t));
-    uint8_t *output_ = (uint8_t *)aom_memalign(
-        32, out_stride * (height + 32) * sizeof(uint8_t));
-    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
-    uint8_t *input = input_ + stride * 16 + 16;
-    uint8_t *output = output_ + out_stride * 16 + 16;
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-    for (i = -16; i < height + 16; ++i)
-      for (j = -16; j < width + 16; ++j)
-        input[i * stride + j] = rnd.Rand16() & 0xFF;
-
-    int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
-                                                        SGRPROJ_PRJ_MIN0),
-                   SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
-                                                        SGRPROJ_PRJ_MIN1) };
-    // Fix a parameter set, since the speed depends slightly on r.
-    // Change this to test different combinations of values of r.
-    int eps = 15;
-
-    av1_loop_restoration_precal();
-
-    aom_usec_timer ref_timer;
-    aom_usec_timer_start(&ref_timer);
-    for (i = 0; i < NUM_ITERS; ++i) {
-      for (k = 0; k < height; k += pu_height)
-        for (j = 0; j < width; j += pu_width) {
-          int w = AOMMIN(pu_width, width - j);
-          int h = AOMMIN(pu_height, height - k);
-          uint8_t *input_p = input + k * stride + j;
-          uint8_t *output_p = output + k * out_stride + j;
-          av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                             output_p, out_stride, tmpbuf, 8,
-                                             0);
-        }
-    }
-    aom_usec_timer_mark(&ref_timer);
-    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
-
-    aom_usec_timer tst_timer;
-    aom_usec_timer_start(&tst_timer);
-    for (i = 0; i < NUM_ITERS; ++i) {
-      for (k = 0; k < height; k += pu_height)
-        for (j = 0; j < width; j += pu_width) {
-          int w = AOMMIN(pu_width, width - j);
-          int h = AOMMIN(pu_height, height - k);
-          uint8_t *input_p = input + k * stride + j;
-          uint8_t *output_p = output + k * out_stride + j;
-          tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
-                   tmpbuf, 8, 0);
-        }
-    }
-    aom_usec_timer_mark(&tst_timer);
-    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
-
-    std::cout << "[          ] C time = " << ref_time / 1000
-              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
-
-    EXPECT_GT(ref_time, tst_time)
-        << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
-        << "C time: " << ref_time << " us\n"
-        << "SIMD time: " << tst_time << " us\n";
-
-    aom_free(input_);
-    aom_free(output_);
-    aom_free(tmpbuf);
-  }
-
-  void RunCorrectnessTest() {
-    tst_fun_ = GET_PARAM(0);
-    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
-    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
-    // Set the maximum width/height to test here. We actually test a small
-    // range of sizes *up to* this size, so that we can check, eg.,
-    // the behaviour on tiles which are not a multiple of 4 wide.
-    const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
-    const int NUM_ITERS = 81;
-    int i, j, k;
-
-    uint8_t *input_ =
-        (uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t));
-    uint8_t *output_ = (uint8_t *)aom_memalign(
-        32, out_stride * (max_h + 32) * sizeof(uint8_t));
-    uint8_t *output2_ = (uint8_t *)aom_memalign(
-        32, out_stride * (max_h + 32) * sizeof(uint8_t));
-    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
-
-    uint8_t *input = input_ + stride * 16 + 16;
-    uint8_t *output = output_ + out_stride * 16 + 16;
-    uint8_t *output2 = output2_ + out_stride * 16 + 16;
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-    av1_loop_restoration_precal();
-
-    for (i = 0; i < NUM_ITERS; ++i) {
-      for (j = -16; j < max_h + 16; ++j)
-        for (k = -16; k < max_w + 16; ++k)
-          input[j * stride + k] = rnd.Rand16() & 0xFF;
-
-      int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
-                                                          SGRPROJ_PRJ_MIN0),
-                     SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
-                                                          SGRPROJ_PRJ_MIN1) };
-      int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
-
-      // Test various tile sizes around 256x256
-      int test_w = max_w - (i / 9);
-      int test_h = max_h - (i % 9);
-
-      for (k = 0; k < test_h; k += pu_height)
-        for (j = 0; j < test_w; j += pu_width) {
-          int w = AOMMIN(pu_width, test_w - j);
-          int h = AOMMIN(pu_height, test_h - k);
-          uint8_t *input_p = input + k * stride + j;
-          uint8_t *output_p = output + k * out_stride + j;
-          uint8_t *output2_p = output2 + k * out_stride + j;
-          tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
-                   tmpbuf, 8, 0);
-          av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
-                                             output2_p, out_stride, tmpbuf, 8,
-                                             0);
-        }
-
-      for (j = 0; j < test_h; ++j)
-        for (k = 0; k < test_w; ++k) {
-          ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
-        }
-    }
-
-    aom_free(input_);
-    aom_free(output_);
-    aom_free(output2_);
-    aom_free(tmpbuf);
-  }
-
- private:
-  SgrFunc tst_fun_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1SelfguidedFilterTest);
-
-TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
-TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, AV1SelfguidedFilterTest,
-    ::testing::Values(av1_apply_selfguided_restoration_sse4_1));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1SelfguidedFilterTest,
-    ::testing::Values(av1_apply_selfguided_restoration_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1SelfguidedFilterTest,
-    ::testing::Values(av1_apply_selfguided_restoration_neon));
-#endif
+                        int32_t *tmpbuf, int bit_depth);
 
 // Test parameter list:
 //  <tst_fun_, bit_depth>
@@ -279,7 +94,7 @@
           uint16_t *output_p = output + k * out_stride + j;
           av1_apply_selfguided_restoration_c(
               CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
-              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth);
         }
     }
     aom_usec_timer_mark(&ref_timer);
@@ -295,8 +110,7 @@
           uint16_t *input_p = input + k * stride + j;
           uint16_t *output_p = output + k * out_stride + j;
           tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
-                   CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
-                   1);
+                   CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth);
         }
     }
     aom_usec_timer_mark(&tst_timer);
@@ -368,11 +182,10 @@
           uint16_t *output_p = output + k * out_stride + j;
           uint16_t *output2_p = output2 + k * out_stride + j;
           tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
-                   CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
-                   1);
+                   CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth);
           av1_apply_selfguided_restoration_c(
               CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
-              CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
+              CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth);
         }
 
       for (j = 0; j < test_h; ++j)
diff --git a/test/subgop_test.cc b/test/subgop_test.cc
index c54caec..8c5fba0 100644
--- a/test/subgop_test.cc
+++ b/test/subgop_test.cc
@@ -26,6 +26,9 @@
                             void *user_priv) AOM_UNUSED;
 static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
                                        YV12_BUFFER_CONFIG *yv12) AOM_UNUSED;
+static void image2yuvconfig_upshift(aom_image_t *hbd_img,
+                                    const aom_image_t *img,
+                                    YV12_BUFFER_CONFIG *yv12) AOM_UNUSED;
 #include "av1/av1_iface_common.h"
 
 #define MAX_SUBGOP_CODES 3
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index be74d30..d99e902 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -25,89 +25,10 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
-typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
-                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
-                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                             ptrdiff_t pred_stride);
-
 namespace {
 
-class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
- public:
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-};
-
 using libaom_test::ACMRandom;
 
-TEST_P(AV1SubtractBlockTest, SimpleSubtract) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  // FIXME(rbultje) split in its own file
-  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
-       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
-    const int block_width = block_size_wide[bsize];
-    const int block_height = block_size_high[bsize];
-    int16_t *diff = reinterpret_cast<int16_t *>(
-        aom_memalign(32, sizeof(*diff) * block_width * block_height * 2));
-    uint8_t *pred = reinterpret_cast<uint8_t *>(
-        aom_memalign(16, block_width * block_height * 2));
-    uint8_t *src = reinterpret_cast<uint8_t *>(
-        aom_memalign(16, block_width * block_height * 2));
-
-    for (int n = 0; n < 100; n++) {
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width * 2; ++c) {
-          src[r * block_width * 2 + c] = rnd.Rand8();
-          pred[r * block_width * 2 + c] = rnd.Rand8();
-        }
-      }
-
-      GetParam()(block_height, block_width, diff, block_width, src, block_width,
-                 pred, block_width);
-
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(diff[r * block_width + c],
-                    (src[r * block_width + c] - pred[r * block_width + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
-        }
-      }
-
-      GetParam()(block_height, block_width, diff, block_width * 2, src,
-                 block_width * 2, pred, block_width * 2);
-
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(
-              diff[r * block_width * 2 + c],
-              (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
-        }
-      }
-    }
-    aom_free(diff);
-    aom_free(pred);
-    aom_free(src);
-  }
-}
-
-INSTANTIATE_TEST_SUITE_P(C, AV1SubtractBlockTest,
-                         ::testing::Values(aom_subtract_block_c));
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(AVX2, AV1SubtractBlockTest,
-                         ::testing::Values(aom_subtract_block_avx2));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AV1SubtractBlockTest,
-                         ::testing::Values(aom_subtract_block_neon));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_SUITE_P(MSA, AV1SubtractBlockTest,
-                         ::testing::Values(aom_subtract_block_msa));
-#endif
-
 typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                                 ptrdiff_t diff_stride, const uint8_t *src_ptr,
                                 ptrdiff_t src_stride, const uint8_t *pred_ptr,
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 53ba944..634dfc0 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -396,16 +396,14 @@
 }
 
 #if HAVE_NEON
-TestSSEFuncs sse_neon[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_neon),
-                            TestSSEFuncs(&aom_highbd_sse_c,
+TestSSEFuncs sse_neon[] = { TestSSEFuncs(&aom_highbd_sse_c,
                                          &aom_highbd_sse_neon) };
 INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
                          Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
 #endif  // HAVE_NEON
 
 #if HAVE_SSE4_1
-TestSSEFuncs sse_sse4[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
-                            TestSSEFuncs(&aom_highbd_sse_c,
+TestSSEFuncs sse_sse4[] = { TestSSEFuncs(&aom_highbd_sse_c,
                                          &aom_highbd_sse_sse4_1) };
 INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest,
                          Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
@@ -413,8 +411,7 @@
 
 #if HAVE_AVX2
 
-TestSSEFuncs sse_avx2[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_avx2),
-                            TestSSEFuncs(&aom_highbd_sse_c,
+TestSSEFuncs sse_avx2[] = { TestSSEFuncs(&aom_highbd_sse_c,
                                          &aom_highbd_sse_avx2) };
 INSTANTIATE_TEST_SUITE_P(AVX2, SSETest,
                          Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 6ed0e12..9390420 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -37,204 +37,6 @@
 
 namespace {
 
-typedef void (*TemporalFilterFunc)(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_level, const MV *subblock_mvs,
-    const int *subblock_mses, const int q_factor, const int filter_strenght,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count);
-typedef libaom_test::FuncParam<TemporalFilterFunc> TemporalFilterFuncParam;
-
-typedef std::tuple<TemporalFilterFuncParam, int> TemporalFilterWithParam;
-
-class TemporalFilterTest
-    : public ::testing::TestWithParam<TemporalFilterWithParam> {
- public:
-  virtual ~TemporalFilterTest() {}
-  virtual void SetUp() {
-    params_ = GET_PARAM(0);
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
-    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
-
-    ASSERT_TRUE(src1_ != NULL);
-    ASSERT_TRUE(src2_ != NULL);
-  }
-
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src1_);
-    aom_free(src2_);
-  }
-  void RunTest(int isRandom, int width, int height, int run_times);
-
-  void GenRandomData(int width, int height, int stride, int stride2) {
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        src1_[ii * stride + jj] = rnd_.Rand8();
-        src2_[ii * stride2 + jj] = rnd_.Rand8();
-      }
-    }
-  }
-
-  void GenExtremeData(int width, int height, int stride, uint8_t *data,
-                      int stride2, uint8_t *data2, uint8_t val) {
-    for (int ii = 0; ii < height; ii++) {
-      for (int jj = 0; jj < width; jj++) {
-        data[ii * stride + jj] = val;
-        data2[ii * stride2 + jj] = (255 - val);
-      }
-    }
-  }
-
- protected:
-  TemporalFilterFuncParam params_;
-  uint8_t *src1_;
-  uint8_t *src2_;
-  ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(TemporalFilterTest);
-
-void TemporalFilterTest::RunTest(int isRandom, int width, int height,
-                                 int run_times) {
-  aom_usec_timer ref_timer, test_timer;
-  for (int k = 0; k < 3; k++) {
-    const int stride = width;
-    const int stride2 = width;
-    if (isRandom) {
-      GenRandomData(width, height, stride, stride2);
-    } else {
-      const int msb = 8;  // Up to 8 bit input
-      const int limit = (1 << msb) - 1;
-      if (k == 0) {
-        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
-      } else {
-        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
-      }
-    }
-    double sigma[1] = { 2.1002103677063437 };
-    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
-    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
-    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
-    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
-    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
-    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
-
-    assert(width == 32 && height == 32);
-    const BLOCK_SIZE block_size = BLOCK_32X32;
-    const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
-    const int subblock_mses[4] = { 15, 16, 17, 18 };
-    const int q_factor = 12;
-    const int filter_strength = 5;
-    const int mb_row = 0;
-    const int mb_col = 0;
-    const int num_planes = 1;
-    YV12_BUFFER_CONFIG *ref_frame =
-        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-    ref_frame->y_crop_height = 360;
-    ref_frame->y_crop_width = 540;
-    ref_frame->heights[0] = height;
-    ref_frame->strides[0] = stride;
-    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
-    ref_frame->buffer_alloc = src;
-    ref_frame->buffers[0] = ref_frame->buffer_alloc;
-    ref_frame->flags = 0;  // Only support low bit-depth test.
-    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
-
-    MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-    mbd->plane[0].subsampling_y = 0;
-    mbd->plane[0].subsampling_x = 0;
-    mbd->bd = 8;
-
-    params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, subblock_mvs, subblock_mses, q_factor,
-                     filter_strength, src2_, accumulator_ref, count_ref);
-    params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, subblock_mvs, subblock_mses, q_factor,
-                     filter_strength, src2_, accumulator_mod, count_mod);
-
-    if (run_times > 1) {
-      aom_usec_timer_start(&ref_timer);
-      for (int j = 0; j < run_times; j++) {
-        params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, subblock_mvs, subblock_mses, q_factor,
-                         filter_strength, src2_, accumulator_ref, count_ref);
-      }
-      aom_usec_timer_mark(&ref_timer);
-      const int elapsed_time_c =
-          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
-
-      aom_usec_timer_start(&test_timer);
-      for (int j = 0; j < run_times; j++) {
-        params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, subblock_mvs, subblock_mses, q_factor,
-                         filter_strength, src2_, accumulator_mod, count_mod);
-      }
-      aom_usec_timer_mark(&test_timer);
-      const int elapsed_time_simd =
-          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
-
-      printf(
-          "c_time=%d \t simd_time=%d \t "
-          "gain=%f\t width=%d\t height=%d \n",
-          elapsed_time_c, elapsed_time_simd,
-          (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
-          height);
-
-    } else {
-      for (int i = 0, l = 0; i < height; i++) {
-        for (int j = 0; j < width; j++, l++) {
-          EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
-              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
-              << "] C accumulator does not match optimized accumulator.";
-          EXPECT_EQ(count_ref[l], count_mod[l])
-              << "Error:" << k << " SSE Sum Test [" << width << "x" << height
-              << "] C count does not match optimized count.";
-        }
-      }
-    }
-
-    free(ref_frame);
-    free(mbd);
-  }
-}
-
-TEST_P(TemporalFilterTest, OperationCheck) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(1, height, height, 1);  // GenRandomData
-  }
-}
-
-TEST_P(TemporalFilterTest, ExtremeValues) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(0, height, height, 1);
-  }
-}
-
-TEST_P(TemporalFilterTest, DISABLED_Speed) {
-  for (int height = 32; height <= 32; height = height * 2) {
-    RunTest(1, height, height, 100000);
-  }
-}
-
-#if HAVE_AVX2
-TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
-    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
-INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
-                         Combine(ValuesIn(temporal_filter_test_avx2),
-                                 Range(64, 65, 4)));
-#endif  // HAVE_AVX2
-
-#if HAVE_SSE2
-TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
-    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
-INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
-                         Combine(ValuesIn(temporal_filter_test_sse2),
-                                 Range(64, 65, 4)));
-#endif  // HAVE_SSE2
-
 typedef void (*HBDTemporalFilterFunc)(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -358,7 +160,6 @@
     DECLARE_ALIGNED(16, uint16_t, src[1024 * 3]);
     ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src);
     ref_frame->buffers[0] = ref_frame->buffer_alloc;
-    ref_frame->flags = YV12_FLAG_HIGHBITDEPTH;  // Only Hihgbd bit-depth test.
     memcpy(src, src1_, 1024 * 3 * sizeof(uint16_t));
 
     MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
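
Illustration only, not part of the patch: with the 8-bit TemporalFilterTest deleted above, only the high-bitdepth variant remains, and it no longer tags its reference frame with YV12_FLAG_HIGHBITDEPTH — presumably because there is no longer a second codepath for the flag to select. A minimal sketch of the frame setup the surviving test relies on, using the names from the hunk above:

    DECLARE_ALIGNED(16, uint16_t, src[1024 * 3]);
    YV12_BUFFER_CONFIG *ref_frame =
        (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
    // 16-bit samples exposed through the usual uint8_t* aliasing.
    ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src);
    ref_frame->buffers[0] = ref_frame->buffer_alloc;
    // src1_ holds the generated test input for this run.
    memcpy(src, src1_, 1024 * 3 * sizeof(uint16_t));
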
diff --git a/test/test.cmake b/test/test.cmake
index 1c7cd8e..85c8d27 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -46,11 +46,6 @@
   "${AOM_ROOT}/test/util.h"
   "${AOM_ROOT}/test/video_source.h")
 
-if(CONFIG_INTERNAL_STATS)
-  list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
-       "${AOM_ROOT}/test/hbd_metrics_test.cc")
-endif()
-
 list(
   APPEND
   AOM_UNIT_TEST_DECODER_SOURCES
@@ -122,7 +117,6 @@
       "${AOM_ROOT}/test/binary_codes_test.cc"
       "${AOM_ROOT}/test/boolcoder_test.cc"
       "${AOM_ROOT}/test/cnn_test.cc"
-      "${AOM_ROOT}/test/coding_path_sync.cc"
       "${AOM_ROOT}/test/decode_multithreaded_test.cc"
       "${AOM_ROOT}/test/divu_small_test.cc"
       "${AOM_ROOT}/test/dr_prediction_test.cc"
@@ -191,13 +185,11 @@
     "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
     "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
     "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
-    "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
     "${AOM_ROOT}/test/av1_nn_predict_test.cc"
     "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
     "${AOM_ROOT}/test/av1_txfm_test.cc"
     "${AOM_ROOT}/test/av1_txfm_test.h"
     "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
-    "${AOM_ROOT}/test/avg_test.cc"
     "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
     "${AOM_ROOT}/test/blend_a64_mask_test.cc"
     "${AOM_ROOT}/test/comp_avg_pred_test.cc"
@@ -209,7 +201,6 @@
     "${AOM_ROOT}/test/fft_test.cc"
     "${AOM_ROOT}/test/fwht4x4_test.cc"
     "${AOM_ROOT}/test/fdct4x4_test.cc"
-    "${AOM_ROOT}/test/hadamard_test.cc"
     "${AOM_ROOT}/test/horver_correlation_test.cc"
     "${AOM_ROOT}/test/masked_sad_test.cc"
     "${AOM_ROOT}/test/masked_variance_test.cc"
@@ -227,7 +218,6 @@
     "${AOM_ROOT}/test/sse_sum_test.cc"
     "${AOM_ROOT}/test/variance_test.cc"
     "${AOM_ROOT}/test/wiener_test.cc"
-    "${AOM_ROOT}/test/frame_error_test.cc"
     "${AOM_ROOT}/test/warp_filter_test.cc"
     "${AOM_ROOT}/test/warp_filter_test_util.cc"
     "${AOM_ROOT}/test/warp_filter_test_util.h"
@@ -243,7 +233,7 @@
     "${AOM_ROOT}/test/av1_quantize_test.cc"
     "${AOM_ROOT}/test/corner_match_test.cc" "${AOM_ROOT}/test/simd_cmp_sse4.cc")
 
-  if(NOT (HAVE_SSE2 OR HAVE_NEON))
+  if(NOT HAVE_SSE2)
     list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
          "${AOM_ROOT}/test/quantize_func_test.cc")
   endif()
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 7ad5e0b..3fd67e1 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -37,9 +37,6 @@
 // 0: Generate MD5 array as required
 #define APPLY_UNIT_TESTS 1
 
-typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
-                            const uint8_t *above, const uint8_t *left);
-
 const int kBPS = 64;
 const int kTotalPixels = kBPS * kBPS;
 // 4 DC variants, V, H, PAETH, SMOOTH, SMOOTH_V, SMOOTH_H
@@ -86,21 +83,16 @@
   DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
 };
 
-// -----------------------------------------------------------------------------
-// Low Bittdepth
-
-typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
-
 static const char *const kTxSizeStrings[TX_SIZES_ALL] = {
   "4X4",  "8X8",  "16X16", "32X32", "64X64", "4X8",   "8X4",
   "8X16", "16X8", "16X32", "32X16", "32X64", "64X32", "4X16",
   "16X4", "8X32", "32X8",  "16X64", "64X16",
 };
 
-void CheckMd5Signature(TX_SIZE tx_size, bool is_hbd,
-                       const char *const signatures[], const void *data,
-                       size_t data_size, int elapsed_time, int idx) {
-  const std::string hbd_str = is_hbd ? "Hbd " : "";
+void CheckMd5Signature(TX_SIZE tx_size, const char *const signatures[],
+                       const void *data, size_t data_size, int elapsed_time,
+                       int idx) {
+  const std::string hbd_str = "Hbd ";
   const std::string name_str = hbd_str + "Intra" + kTxSizeStrings[tx_size];
   libaom_test::MD5 md5;
   md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
@@ -116,770 +108,8 @@
 #endif
 }
 
-void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
-                   const char *const signatures[]) {
-  const int block_width = tx_size_wide[tx_size];
-  const int block_height = tx_size_high[tx_size];
-  const int num_pixels_per_test =
-      block_width * block_height * kNumAv1IntraFuncs;
-  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
-  Av1IntraPredTestMem intra_pred_test_mem;
-  intra_pred_test_mem.Init(block_width, block_height, 8);
-
-  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
-    if (pred_funcs[k] == NULL) continue;
-    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
-           sizeof(intra_pred_test_mem.src));
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
-      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
-                    intra_pred_test_mem.above, intra_pred_test_mem.left);
-    }
-    libaom_test::ClearSystemState();
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time =
-        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
-    CheckMd5Signature(
-        tx_size, false, signatures, intra_pred_test_mem.src,
-        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
-        elapsed_time, k);
-  }
-}
-
-static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
-  {
-      // 4X4
-      "e7ed7353c3383fff942e500e9bfe82fe",
-      "2a4a26fcc6ce005eadc08354d196c8a9",
-      "269d92eff86f315d9c38fe7640d85b15",
-      "ae2960eea9f71ee3dabe08b282ec1773",
-      "6c1abcc44e90148998b51acd11144e9c",
-      "f7bb3186e1ef8a2b326037ff898cad8e",
-      "59fc0e923a08cfac0a493fb38988e2bb",
-      "9ff8bb37d9c830e6ab8ecb0c435d3c91",
-      "de6937fca02354f2874dbc5dbec5d5b3",
-      "723cf948137f7d8c7860d814e55ae67d",
-  },
-  {
-      // 8X8
-      "d8bbae5d6547cfc17e4f5f44c8730e88",
-      "373bab6d931868d41a601d9d88ce9ac3",
-      "6fdd5ff4ff79656c14747598ca9e3706",
-      "d9661c2811d6a73674f40ffb2b841847",
-      "7c722d10b19ccff0b8c171868e747385",
-      "f81dd986eb2b50f750d3a7da716b7e27",
-      "064404361748dd111a890a1470d7f0ea",
-      "dc29b7e1f78cc8e7525d5ea4c0ab9b78",
-      "97111eb1bc26bade6272015df829f1ae",
-      "d19a8a73cc46b807f2c5e817576cc1e1",
-  },
-  {
-      // 16X16
-      "50971c07ce26977d30298538fffec619",
-      "527a6b9e0dc5b21b98cf276305432bef",
-      "7eff2868f80ebc2c43a4f367281d80f7",
-      "67cd60512b54964ef6aff1bd4816d922",
-      "48371c87dc95c08a33b2048f89cf6468",
-      "b0acf2872ee411d7530af6d2625a7084",
-      "93d6b5352b571805ab16a55e1bbed86a",
-      "03764e4c0aebbc180e4e2c68fb06df2b",
-      "bb6c74c9076c9f266ab11fb57060d8e6",
-      "0c5162bc28489756ddb847b5678e6f07",
-  },
-  {
-      // 32X32
-      "a0a618c900e65ae521ccc8af789729f2",
-      "985aaa7c72b4a6c2fb431d32100cf13a",
-      "10662d09febc3ca13ee4e700120daeb5",
-      "b3b01379ba08916ef6b1b35f7d9ad51c",
-      "9f4261755795af97e34679c333ec7004",
-      "bc2c9da91ad97ef0d1610fb0a9041657",
-      "ef1653982b69e1f64bee3759f3e1ec45",
-      "1a51a675deba2c83282142eb48d3dc3d",
-      "866c224746dc260cda861a7b1b383fb3",
-      "cea23799fc3526e1b6a6ff02b42b82af",
-  },
-  {
-      // 64X64
-      "6e1094fa7b50bc813aa2ba29f5df8755",
-      "afe020786b83b793c2bbd9468097ff6e",
-      "be91585259bc37bf4dc1651936e90b3e",
-      "a1650dbcd56e10288c3e269eca37967d",
-      "9e5c34f3797e0cdd3cd9d4c05b0d8950",
-      "bc87be7ac899cc6a28f399d7516c49fe",
-      "9811fd0d2dd515f06122f5d1bd18b784",
-      "3c140e466f2c2c0d9cb7d2157ab8dc27",
-      "9543de76c925a8f6adc884cc7f98dc91",
-      "df1df0376cc944afe7e74e94f53e575a",
-  },
-  {
-      // 4X8
-      "d9fbebdc85f71ab1e18461b2db4a2adc",
-      "5ccb2a68284bc9714d94b8a06ccadbb2",
-      "735d059abc2744f3ff3f9590f7191b37",
-      "d9fbebdc85f71ab1e18461b2db4a2adc",
-      "6819497c44cd0ace120add83672996ee",
-      "7e3244f5a2d3edf81c7e962a842b97f9",
-      "809350f164cd4d1650850bb0f59c3260",
-      "1b60a394331eeab6927a6f8aaff57040",
-      "5307de1bd7329ba6b281d2c1b0b457f9",
-      "24c58a8138339846d95568efb91751db",
-  },
-  {
-      // 8X4
-      "23f9fc11344426c9bee2e06d57dfd628",
-      "2d71a26d1bae1fb34734de7b42fc5eb7",
-      "5af9c1b2fd9d5721fad67b67b3f7c816",
-      "00d71b17be662753813d515f197d145e",
-      "bef10ec984427e28f4390f43809d10af",
-      "77773cdfb7ed6bc882ab202a64b0a470",
-      "2cc48bd66d6b0121b5221d52ccd732af",
-      "b302155e1c9eeeafe2ba2bf68e807a46",
-      "561bc8d0e76d5041ebd5168fc6a115e1",
-      "81d0113fb1d0a9a24ffd6f1987b77948",
-  },
-  {
-      // 8X16
-      "c849de88b24f773dfcdd1d48d1209796",
-      "6cb807c1897b94866a0f3d3c56ed8695",
-      "d56db05a8ac7981762f5b877f486c4ef",
-      "b4bc01eb6e59a40922ad17715cafb04b",
-      "09d178439534f4062ae687c351f66d64",
-      "644501399cf73080ac606e5cef7ca09b",
-      "278076495180e17c065a95ab7278539a",
-      "9dd7f324816f242be408ffeb0c673732",
-      "f520c4a20acfa0bea1d253c6f0f040fd",
-      "85f38df809df2c2d7c8b4a157a65cd44",
-  },
-  {
-      // 16X8
-      "b4cbdbdf10ce13300b4063a3daf99e04",
-      "3731e1e6202064a9d0604d7c293ecee4",
-      "6c856188c4256a06452f0d5d70cac436",
-      "1f2192b4c8c497589484ea7bf9c944e8",
-      "84011bd4b7f565119d06787840e333a0",
-      "0e48949f7a6aa36f0d76b5d01f91124a",
-      "60eff8064634b6c73b10681356baeee9",
-      "1559aeb081a9c0c71111d6093c2ff9fd",
-      "c15479b739713773e5cabb748451987b",
-      "72e33ec12c9b67aea26d8d005fb82de2",
-  },
-  {
-      // 16X32
-      "abe5233d189cdbf79424721571bbaa7b",
-      "282759f81e3cfb2e2d396fe406b72a8b",
-      "e2224926c264f6f174cbc3167a233168",
-      "6814e85c2b33f8c9415d62e80394b47b",
-      "99cbbb60459c08a3061d72c4e4f6276a",
-      "1d1567d40b8e816f8c1f71e576fe0f87",
-      "36fdd371b624a075814d497c4832ec85",
-      "8ab8da61b727442b6ff692b40d0df018",
-      "e35a10ad7fdf2327e821504a90f6a6eb",
-      "1f7211e727dc1de7d6a55d082fbdd821",
-  },
-  {
-      // 32X16
-      "d1aeb8d5fdcfd3307922af01a798a4dc",
-      "b0bcb514ebfbee065faea9d34c12ae75",
-      "d6a18c63b4e909871c0137ca652fad23",
-      "fd047f2fc1b8ffb95d0eeef3e8796a45",
-      "645ab60779ea348fd93c81561c31bab9",
-      "4409633c9db8dff41ade4292a3a56e7f",
-      "5e36a11e069b31c2a739f3a9c7b37c24",
-      "e83b9483d702cfae496991c3c7fa92c0",
-      "12f6ddf98c7f30a277307f1ea935b030",
-      "354321d6c32bbdb0739e4fa2acbf41e1",
-  },
-  {
-      // 32X64
-      "0ce332b343934b34cd4417725faa85cb",
-      "4e2a2cfd8f56f15939bdfc753145b303",
-      "0f46d124ba9f48cdd5d5290acf786d6d",
-      "e1e8ed803236367821981500a3d9eebe",
-      "1d2f8e48e3adb7c448be05d9f66f4954",
-      "9fb2e176636a5689b26f73ca73fcc512",
-      "e720ebccae7e25e36f23da53ae5b5d6a",
-      "86fe4364734169aaa4520d799890d530",
-      "b1870290764bb1b100d1974e2bd70f1d",
-      "ce5b238e19d85ef69d85badfab4e63ae",
-  },
-  {
-      // 64X32
-      "a6c5aeb722615089efbca80b02951ceb",
-      "538424b24bd0830f21788e7238ca762f",
-      "80c15b303235f9bc2259027bb92dfdc4",
-      "e48e1ac15e97191a8fda08d62fff343e",
-      "12604b37875533665078405ef4582e35",
-      "0048afa17bd3e1632d68b96048836530",
-      "07a0cfcb56a5eed50c4bd6c26814336b",
-      "529d8a070de5bc6531fa3ee8f450c233",
-      "33c50a11c7d78f72434064f634305e95",
-      "e0ef7f0559c1a50ec5a8c12011b962f7",
-  },
-  {
-      // 4X16
-      "750491056568eb8fe15387b86bdf06b8",
-      "3a52dae9f599f08cfb3bd1b910dc0e11",
-      "af79f71e3e03dbeca44e2e13561f70c7",
-      "ca7dfd7624afc0c06fb5552f44398535",
-      "b591af115444bf43140c29c269f68fb2",
-      "483d942ae36e69e62f31eb215331416f",
-      "f14b58525e81870bc5d95c7ac71a347f",
-      "371208bb4027d9badb04095d1590bbc4",
-      "c7049c21b2924d70c7c12784d6b6b796",
-      "7d87233f4b5b0f12086045e5d7b2d4c2",
-  },
-  {
-      // 16X4
-      "7c6e325a65e77e732b3adbe237e045e4",
-      "24478f93ffcec47852e004d0fe948464",
-      "258d042c67d4ba3ecfa667f0adc9aebf",
-      "b2cd21d06959f159a1f3c4d9768ee7fb",
-      "b4e1f38157bf8410e7c3da02f687a343",
-      "869e703729eb0fc0711c254944ff5d5a",
-      "9638dd77105a640b146a8201ea7a0801",
-      "919d932c6af8a1cc7486e8ce996dd487",
-      "e1c9be493b6714c7ae48f30044c43140",
-      "bf0fe3889d654b2f6eb98c8fc751f9e4",
-  },
-  {
-      // 8X32
-      "8dfac4319fe0bd40013ffb3102da8c72",
-      "feb46b6dc4e2ca0a09533bfc51d4dcb0",
-      "850837ec714c37262216527aaf4cbbe9",
-      "4603c7800fb08361f163daca876e8bda",
-      "1ff95e7d2debc27b05806fb25abfd624",
-      "d81b9a51a062b23ca7823804cb7bec22",
-      "f1d8978158766f46335203608cb807e7",
-      "f3527096256258c0878d644a9d7d53ca",
-      "cbde98ac8b009953eb112807ad2ea29e",
-      "654fb1153415747feae599f538122af5",
-  },
-  {
-      // 32X8
-      "3d4ee16fab374357474f60b845327bc7",
-      "bc17c5059473a476df4e85f56395ad55",
-      "3d4ee16fab374357474f60b845327bc7",
-      "c14b8db34dc2355b84e3735c9ba16c7f",
-      "a71d25b5d47a92a8b9223c98f18458ee",
-      "6c1cfe2b1893f4576a80675687cb6426",
-      "92d11bbef8b85bb48d799bb055de3514",
-      "bcf81d1db8ae5cc03360467f44f498ec",
-      "79f8c564163555592e808e145eaf5c60",
-      "46fff139cef2ef773938bcc8b0e5abb8",
-  },
-  {
-      // 16X64
-      "3b2a053ee8b05a8ac35ad23b0422a151",
-      "12b0c69595328c465e0b25e0c9e3e9fc",
-      "f77c544ac8035e01920deae40cee7b07",
-      "727797ef15ccd8d325476fe8f12006a3",
-      "f3be77c0fe67eb5d9d515e92bec21eb7",
-      "f1ece6409e01e9dd98b800d49628247d",
-      "efd2ec9bfbbd4fd1f6604ea369df1894",
-      "ec703de918422b9e03197ba0ed60a199",
-      "739418efb89c07f700895deaa5d0b3e3",
-      "9943ae1bbeeebfe1d3a92dc39e049d63",
-  },
-  {
-      // 64X16
-      "821b76b1494d4f84d20817840f719a1a",
-      "69e462c3338a9aaf993c3f7cfbc15649",
-      "516d8f6eb054d74d150e7b444185b6b9",
-      "de1b736e9d99129609d6ef3a491507a0",
-      "fd9b4276e7affe1e0e4ce4f428058994",
-      "cd82fd361a4767ac29a9f406b480b8f3",
-      "2792c2f810157a4a6cb13c28529ff779",
-      "1220442d90c4255ba0969d28b91e93a6",
-      "c7253e10b45f7f67dfee3256c9b94825",
-      "879792198071c7e0b50b9b5010d8c18f",
-  },
-};
-
 }  // namespace
 
-// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
-// to TestIntraPred. The test name is 'arch.TestIntraPred_tx_size', e.g.,
-// C.TestIntraPred.0
-#define INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, h,  \
-                        paeth, smooth, smooth_v, smooth_h)                 \
-  TEST(arch, DISABLED_##TestIntraPred_##tx_size) {                         \
-    static const AvxPredFunc aom_intra_pred[] = {                          \
-      dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \
-    };                                                                     \
-    TestIntraPred(tx_size, aom_intra_pred, kSignatures[tx_size]);          \
-  }
-
-// -----------------------------------------------------------------------------
-// 4x4, 4x8, 4x16
-
-INTRA_PRED_TEST(C_1, TX_4X4, aom_dc_predictor_4x4_c,
-                aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
-                aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
-                aom_h_predictor_4x4_c, aom_paeth_predictor_4x4_c,
-                aom_smooth_predictor_4x4_c, aom_smooth_v_predictor_4x4_c,
-                aom_smooth_h_predictor_4x4_c)
-
-INTRA_PRED_TEST(C_2, TX_4X8, aom_dc_predictor_4x8_c,
-                aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c,
-                aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c,
-                aom_h_predictor_4x8_c, aom_paeth_predictor_4x8_c,
-                aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c,
-                aom_smooth_h_predictor_4x8_c)
-
-INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c,
-                aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c,
-                aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c,
-                aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c,
-                aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c,
-                aom_smooth_h_predictor_4x16_c)
-
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2,
-                aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
-                aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
-                aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_dc_predictor_4x8_sse2,
-                aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
-                aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
-                aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2,
-                aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2,
-                aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2,
-                aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3,
-                aom_smooth_v_predictor_4x4_ssse3,
-                aom_smooth_h_predictor_4x4_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
-                aom_smooth_v_predictor_4x8_ssse3,
-                aom_smooth_h_predictor_4x8_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3,
-                aom_smooth_v_predictor_4x16_ssse3,
-                aom_smooth_h_predictor_4x16_ssse3)
-#endif  // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, NULL, NULL, NULL,
-                NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon,
-                aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
-                aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
-                aom_h_predictor_4x4_neon, NULL, NULL, NULL, NULL)
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa,
-                aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa,
-                aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa,
-                aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 8x8, 8x4, 8x16, 8x32
-
-INTRA_PRED_TEST(C_1, TX_8X8, aom_dc_predictor_8x8_c,
-                aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
-                aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
-                aom_h_predictor_8x8_c, aom_paeth_predictor_8x8_c,
-                aom_smooth_predictor_8x8_c, aom_smooth_v_predictor_8x8_c,
-                aom_smooth_h_predictor_8x8_c)
-
-INTRA_PRED_TEST(C_2, TX_8X4, aom_dc_predictor_8x4_c,
-                aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c,
-                aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c,
-                aom_h_predictor_8x4_c, aom_paeth_predictor_8x4_c,
-                aom_smooth_predictor_8x4_c, aom_smooth_v_predictor_8x4_c,
-                aom_smooth_h_predictor_8x4_c)
-
-INTRA_PRED_TEST(C_3, TX_8X16, aom_dc_predictor_8x16_c,
-                aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
-                aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
-                aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c,
-                aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c,
-                aom_smooth_h_predictor_8x16_c)
-
-INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c,
-                aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c,
-                aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c,
-                aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c,
-                aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c,
-                aom_smooth_h_predictor_8x32_c)
-
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2,
-                aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
-                aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
-                aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_dc_predictor_8x4_sse2,
-                aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
-                aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2,
-                aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_dc_predictor_8x16_sse2,
-                aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
-                aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
-                aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2,
-                aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
-                aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
-                aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_8x8_ssse3, aom_smooth_predictor_8x8_ssse3,
-                aom_smooth_v_predictor_8x8_ssse3,
-                aom_smooth_h_predictor_8x8_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3,
-                aom_smooth_v_predictor_8x4_ssse3,
-                aom_smooth_h_predictor_8x4_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
-                aom_smooth_v_predictor_8x16_ssse3,
-                aom_smooth_h_predictor_8x16_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3,
-                aom_smooth_v_predictor_8x32_ssse3,
-                aom_smooth_h_predictor_8x32_ssse3)
-#endif  // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TX_8X8, aom_dc_predictor_8x8_dspr2, NULL, NULL, NULL,
-                NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon,
-                aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
-                aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
-                aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL)
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa,
-                aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa,
-                aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa,
-                aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 16x16, 16x8, 16x32, 16x4, 16x64
-
-INTRA_PRED_TEST(C_1, TX_16X16, aom_dc_predictor_16x16_c,
-                aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
-                aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
-                aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c,
-                aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c,
-                aom_smooth_h_predictor_16x16_c)
-
-INTRA_PRED_TEST(C_2, TX_16X8, aom_dc_predictor_16x8_c,
-                aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
-                aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
-                aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c,
-                aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c,
-                aom_smooth_h_predictor_16x8_c)
-
-INTRA_PRED_TEST(C_3, TX_16X32, aom_dc_predictor_16x32_c,
-                aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
-                aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
-                aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c,
-                aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c,
-                aom_smooth_h_predictor_16x32_c)
-
-INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c,
-                aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c,
-                aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c,
-                aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c,
-                aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c,
-                aom_smooth_h_predictor_16x4_c)
-
-INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c,
-                aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c,
-                aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c,
-                aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c,
-                aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c,
-                aom_smooth_h_predictor_16x64_c)
-
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2,
-                aom_dc_left_predictor_16x16_sse2,
-                aom_dc_top_predictor_16x16_sse2,
-                aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
-                aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_dc_predictor_16x8_sse2,
-                aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2,
-                aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2,
-                aom_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_dc_predictor_16x32_sse2,
-                aom_dc_left_predictor_16x32_sse2,
-                aom_dc_top_predictor_16x32_sse2,
-                aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
-                aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
-                aom_dc_left_predictor_16x64_sse2,
-                aom_dc_top_predictor_16x64_sse2,
-                aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
-                aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2,
-                aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
-                aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
-                aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x16_ssse3,
-                aom_smooth_predictor_16x16_ssse3,
-                aom_smooth_v_predictor_16x16_ssse3,
-                aom_smooth_h_predictor_16x16_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3,
-                aom_smooth_v_predictor_16x8_ssse3,
-                aom_smooth_h_predictor_16x8_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x32_ssse3,
-                aom_smooth_predictor_16x32_ssse3,
-                aom_smooth_v_predictor_16x32_ssse3,
-                aom_smooth_h_predictor_16x32_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x64_ssse3,
-                aom_smooth_predictor_16x64_ssse3,
-                aom_smooth_v_predictor_16x64_ssse3,
-                aom_smooth_h_predictor_16x64_ssse3)
-INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3,
-                aom_smooth_v_predictor_16x4_ssse3,
-                aom_smooth_h_predictor_16x4_ssse3)
-#endif  // HAVE_SSSE3
-
-#if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL)
-#endif  // HAVE_AVX2
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TX_16X16, aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL,
-                NULL, aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon,
-                aom_dc_left_predictor_16x16_neon,
-                aom_dc_top_predictor_16x16_neon,
-                aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
-                aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL)
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa,
-                aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa,
-                aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa,
-                aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 32x32, 32x16, 32x64, 32x8
-
-INTRA_PRED_TEST(C_1, TX_32X32, aom_dc_predictor_32x32_c,
-                aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
-                aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
-                aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c,
-                aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c,
-                aom_smooth_h_predictor_32x32_c)
-
-INTRA_PRED_TEST(C_2, TX_32X16, aom_dc_predictor_32x16_c,
-                aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
-                aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
-                aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c,
-                aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c,
-                aom_smooth_h_predictor_32x16_c)
-
-INTRA_PRED_TEST(C_3, TX_32X64, aom_dc_predictor_32x64_c,
-                aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c,
-                aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c,
-                aom_h_predictor_32x64_c, aom_paeth_predictor_32x64_c,
-                aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c,
-                aom_smooth_h_predictor_32x64_c)
-
-INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c,
-                aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c,
-                aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c,
-                aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c,
-                aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c,
-                aom_smooth_h_predictor_32x8_c)
-
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2,
-                aom_dc_left_predictor_32x32_sse2,
-                aom_dc_top_predictor_32x32_sse2,
-                aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
-                aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_dc_predictor_32x16_sse2,
-                aom_dc_left_predictor_32x16_sse2,
-                aom_dc_top_predictor_32x16_sse2,
-                aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
-                aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2,
-                aom_dc_left_predictor_32x64_sse2,
-                aom_dc_top_predictor_32x64_sse2,
-                aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
-                aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2,
-                aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2,
-                aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2,
-                aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_32x32_ssse3,
-                aom_smooth_predictor_32x32_ssse3,
-                aom_smooth_v_predictor_32x32_ssse3,
-                aom_smooth_h_predictor_32x32_ssse3)
-INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_32x16_ssse3,
-                aom_smooth_predictor_32x16_ssse3,
-                aom_smooth_v_predictor_32x16_ssse3,
-                aom_smooth_h_predictor_32x16_ssse3)
-INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_32x64_ssse3,
-                aom_smooth_predictor_32x64_ssse3,
-                aom_smooth_v_predictor_32x64_ssse3,
-                aom_smooth_h_predictor_32x64_ssse3)
-INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3,
-                aom_smooth_v_predictor_32x8_ssse3,
-                aom_smooth_h_predictor_32x8_ssse3)
-#endif  // HAVE_SSSE3
-
-#if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_1, TX_32X32, aom_dc_predictor_32x32_avx2,
-                aom_dc_left_predictor_32x32_avx2,
-                aom_dc_top_predictor_32x32_avx2,
-                aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
-                aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2,
-                NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_2, TX_32X16, aom_dc_predictor_32x16_avx2,
-                aom_dc_left_predictor_32x16_avx2,
-                aom_dc_top_predictor_32x16_avx2,
-                aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
-                NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2,
-                aom_dc_left_predictor_32x64_avx2,
-                aom_dc_top_predictor_32x64_avx2,
-                aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
-                NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
-#endif  // HAVE_AVX2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon,
-                aom_dc_left_predictor_32x32_neon,
-                aom_dc_top_predictor_32x32_neon,
-                aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
-                aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL)
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TX_32X32, aom_dc_predictor_32x32_msa,
-                aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa,
-                aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa,
-                aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 64x64, 64x32, 64x16
-
-INTRA_PRED_TEST(C_1, TX_64X64, aom_dc_predictor_64x64_c,
-                aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c,
-                aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c,
-                aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c,
-                aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c,
-                aom_smooth_h_predictor_64x64_c)
-
-INTRA_PRED_TEST(C_2, TX_64X32, aom_dc_predictor_64x32_c,
-                aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c,
-                aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c,
-                aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c,
-                aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c,
-                aom_smooth_h_predictor_64x32_c)
-
-INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c,
-                aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c,
-                aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c,
-                aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c,
-                aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
-                aom_smooth_h_predictor_64x16_c)
-
-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
-                aom_dc_left_predictor_64x64_sse2,
-                aom_dc_top_predictor_64x64_sse2,
-                aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
-                aom_h_predictor_64x64_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2,
-                aom_dc_left_predictor_64x32_sse2,
-                aom_dc_top_predictor_64x32_sse2,
-                aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
-                aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
-                aom_dc_left_predictor_64x16_sse2,
-                aom_dc_top_predictor_64x16_sse2,
-                aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
-                aom_h_predictor_64x16_sse2, NULL, NULL, NULL, NULL)
-#endif
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_64x64_ssse3,
-                aom_smooth_predictor_64x64_ssse3,
-                aom_smooth_v_predictor_64x64_ssse3,
-                aom_smooth_h_predictor_64x64_ssse3)
-INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_64x32_ssse3,
-                aom_smooth_predictor_64x32_ssse3,
-                aom_smooth_v_predictor_64x32_ssse3,
-                aom_smooth_h_predictor_64x32_ssse3)
-INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
-                aom_paeth_predictor_64x16_ssse3,
-                aom_smooth_predictor_64x16_ssse3,
-                aom_smooth_v_predictor_64x16_ssse3,
-                aom_smooth_h_predictor_64x16_ssse3)
-#endif
-
-#if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
-                aom_dc_left_predictor_64x64_avx2,
-                aom_dc_top_predictor_64x64_avx2,
-                aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
-                NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2,
-                aom_dc_left_predictor_64x32_avx2,
-                aom_dc_top_predictor_64x32_avx2,
-                aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
-                NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
-                aom_dc_left_predictor_64x16_avx2,
-                aom_dc_top_predictor_64x16_avx2,
-                aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
-                NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
-#endif
-
 // -----------------------------------------------------------------------------
 // High Bitdepth
 namespace {
@@ -916,7 +146,7 @@
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
     CheckMd5Signature(
-        tx_size, true, signatures, intra_pred_test_mem.src,
+        tx_size, signatures, intra_pred_test_mem.src,
         intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
         elapsed_time, k);
   }
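
Illustration only, not part of the patch: after this change CheckMd5Signature() no longer takes an is_hbd flag, so the remaining high-bitdepth speed test calls it as sketched below (names follow the hunk above; the timing loop body is elided):

    aom_usec_timer timer;
    aom_usec_timer_start(&timer);
    // ... run the predictor under test kNumTests times on intra_pred_test_mem.src ...
    aom_usec_timer_mark(&timer);
    const int elapsed_time =
        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
    CheckMd5Signature(tx_size, signatures, intra_pred_test_mem.src,
                      intra_pred_test_mem.num_pixels *
                          sizeof(*intra_pred_test_mem.src),
                      elapsed_time, k);
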
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index d4b4bc4..6e59477 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -136,7 +136,6 @@
   OpenMD5File(md5_filename);
 
   // Set decode config and flags.
-  cfg.allow_lowbitdepth = 0;
   set_cfg(cfg);
   set_flags(flags);
 
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index d3d1847..58e4486 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -35,7 +35,6 @@
     cfg.w = 704;
     cfg.h = 576;
     cfg.threads = 1;
-    cfg.allow_lowbitdepth = 1;
     fw_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
diff --git a/test/variance_test.cc b/test/variance_test.cc
index e149f4d..785371a 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -220,114 +220,6 @@
   return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
-static uint32_t dist_wtd_subpel_avg_variance_ref(
-    const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w,
-    int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth,
-    aom_bit_depth_t bit_depth, DIST_WTD_COMP_PARAMS *jcp_param) {
-  int64_t se = 0;
-  uint64_t sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
-
-  xoff <<= 1;
-  yoff <<= 1;
-
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      // bilinear interpolation at a 16th pel step
-      if (!use_high_bit_depth) {
-        const int a1 = ref[(w + 0) * (y + 0) + x + 0];
-        const int a2 = ref[(w + 0) * (y + 0) + x + 1];
-        const int b1 = ref[(w + 0) * (y + 1) + x + 0];
-        const int b2 = ref[(w + 0) * (y + 1) + x + 1];
-        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-        const int r = a + (((b - a) * yoff + 8) >> 4);
-        const int avg = ROUND_POWER_OF_TWO(
-            r * jcp_param->fwd_offset +
-                second_pred[w * y + x] * jcp_param->bck_offset,
-            DIST_PRECISION_BITS);
-        const int diff = avg - src[w * y + x];
-
-        se += diff;
-        sse += diff * diff;
-      } else {
-        const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
-        const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
-        const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
-        const int a1 = ref16[(w + 0) * (y + 0) + x + 0];
-        const int a2 = ref16[(w + 0) * (y + 0) + x + 1];
-        const int b1 = ref16[(w + 0) * (y + 1) + x + 0];
-        const int b2 = ref16[(w + 0) * (y + 1) + x + 1];
-        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-        const int r = a + (((b - a) * yoff + 8) >> 4);
-        const int avg =
-            ROUND_POWER_OF_TWO(r * jcp_param->fwd_offset +
-                                   sec16[w * y + x] * jcp_param->bck_offset,
-                               DIST_PRECISION_BITS);
-        const int diff = avg - src16[w * y + x];
-
-        se += diff;
-        sse += diff * diff;
-      }
-    }
-  }
-  RoundHighBitDepth(bit_depth, &se, &sse);
-  *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
-}
-
-static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
-                                         int xoff, int yoff,
-                                         const int32_t *wsrc,
-                                         const int32_t *mask, uint32_t *sse_ptr,
-                                         bool use_high_bit_depth_,
-                                         aom_bit_depth_t bit_depth) {
-  int64_t se = 0;
-  uint64_t sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
-
-  xoff <<= 1;
-  yoff <<= 1;
-
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      // Bilinear interpolation at a 16th pel step.
-      if (!use_high_bit_depth_) {
-        const int a1 = pre[(w + 1) * (y + 0) + x + 0];
-        const int a2 = pre[(w + 1) * (y + 0) + x + 1];
-        const int b1 = pre[(w + 1) * (y + 1) + x + 0];
-        const int b2 = pre[(w + 1) * (y + 1) + x + 1];
-        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-        const int r = a + (((b - a) * yoff + 8) >> 4);
-        const int diff = ROUND_POWER_OF_TWO_SIGNED(
-            wsrc[w * y + x] - r * mask[w * y + x], 12);
-        se += diff;
-        sse += diff * diff;
-      } else {
-        uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre);
-        const int a1 = pre16[(w + 1) * (y + 0) + x + 0];
-        const int a2 = pre16[(w + 1) * (y + 0) + x + 1];
-        const int b1 = pre16[(w + 1) * (y + 1) + x + 0];
-        const int b2 = pre16[(w + 1) * (y + 1) + x + 1];
-        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-        const int r = a + (((b - a) * yoff + 8) >> 4);
-        const int diff = ROUND_POWER_OF_TWO_SIGNED(
-            wsrc[w * y + x] - r * mask[w * y + x], 12);
-        se += diff;
-        sse += diff * diff;
-      }
-    }
-  }
-  RoundHighBitDepth(bit_depth, &se, &sse);
-  *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
@@ -410,106 +302,6 @@
 
 // Main class for testing a function type
 template <typename FunctionType>
-class MseWxHTestClass
-    : public ::testing::TestWithParam<TestParams<FunctionType> > {
- public:
-  virtual void SetUp() {
-    params_ = this->GetParam();
-
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<uint16_t *>(
-        aom_memalign(16, block_size() * sizeof(src_)));
-    dst_ = reinterpret_cast<uint8_t *>(
-        aom_memalign(16, block_size() * sizeof(dst_)));
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(dst_ != NULL);
-  }
-
-  virtual void TearDown() {
-    aom_free(src_);
-    aom_free(dst_);
-    src_ = NULL;
-    dst_ = NULL;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RefMatchTestMse();
-  void SpeedTest();
-
- protected:
-  ACMRandom rnd_;
-  uint8_t *dst_;
-  uint16_t *src_;
-  TestParams<FunctionType> params_;
-
-  // some relay helpers
-  int block_size() const { return params_.block_size; }
-  int width() const { return params_.width; }
-  int height() const { return params_.height; }
-  int d_stride() const { return params_.width; }  // stride is same as width
-  int s_stride() const { return params_.width; }  // stride is same as width
-};
-
-template <typename MseWxHFunctionType>
-void MseWxHTestClass<MseWxHFunctionType>::SpeedTest() {
-  aom_usec_timer ref_timer, test_timer;
-  double elapsed_time_c = 0;
-  double elapsed_time_simd = 0;
-  int run_time = 10000000;
-  int w = width();
-  int h = height();
-  int dstride = d_stride();
-  int sstride = s_stride();
-
-  for (int k = 0; k < block_size(); ++k) {
-    dst_[k] = rnd_.Rand8();
-    src_[k] = rnd_.Rand8();
-  }
-  aom_usec_timer_start(&ref_timer);
-  for (int i = 0; i < run_time; i++) {
-    aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h);
-  }
-  aom_usec_timer_mark(&ref_timer);
-  elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
-
-  aom_usec_timer_start(&test_timer);
-  for (int i = 0; i < run_time; i++) {
-    params_.func(dst_, dstride, src_, sstride, w, h);
-  }
-  aom_usec_timer_mark(&test_timer);
-  elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
-
-  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", width(), height(),
-         elapsed_time_c, elapsed_time_simd,
-         (elapsed_time_c / elapsed_time_simd));
-}
-
-template <typename MseWxHFunctionType>
-void MseWxHTestClass<MseWxHFunctionType>::RefMatchTestMse() {
-  uint64_t mse_ref = 0;
-  uint64_t mse_mod = 0;
-  int w = width();
-  int h = height();
-  int dstride = d_stride();
-  int sstride = s_stride();
-
-  for (int i = 0; i < 10; i++) {
-    for (int k = 0; k < block_size(); ++k) {
-      dst_[k] = rnd_.Rand8();
-      src_[k] = rnd_.Rand8();
-    }
-    ASM_REGISTER_STATE_CHECK(
-        mse_ref = aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h));
-    ASM_REGISTER_STATE_CHECK(
-        mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
-    EXPECT_EQ(mse_ref, mse_mod)
-        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
-  }
-}
-
-// Main class for testing a function type
-template <typename FunctionType>
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
@@ -975,47 +767,6 @@
   }
 }
 
-template <>
-void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
-  for (int x = 0; x < 8; ++x) {
-    for (int y = 0; y < 8; ++y) {
-      if (!use_high_bit_depth()) {
-        for (int j = 0; j < block_size(); j++) {
-          src_[j] = rnd_.Rand8();
-          sec_[j] = rnd_.Rand8();
-        }
-        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
-          ref_[j] = rnd_.Rand8();
-        }
-      } else {
-        for (int j = 0; j < block_size(); j++) {
-          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
-          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
-        }
-        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
-          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
-        }
-      }
-      for (int x0 = 0; x0 < 2; ++x0) {
-        for (int y0 = 0; y0 < 4; ++y0) {
-          uint32_t sse1, sse2;
-          uint32_t var1, var2;
-          jcp_param_.fwd_offset = quant_dist_lookup_table[y0][x0];
-          jcp_param_.bck_offset = quant_dist_lookup_table[y0][1 - x0];
-          ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
-                                                       src_, width(), &sse1,
-                                                       sec_, &jcp_param_));
-          var2 = dist_wtd_subpel_avg_variance_ref(
-              ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
-              &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
-          EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
-          EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
-        }
-      }
-    }
-  }
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 static const int kMaskMax = 64;
@@ -1077,311 +828,23 @@
   uint32_t bd_mask() const { return params_.mask; }
 };
 
-template <>
-void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
-  for (int x = 0; x < 8; ++x) {
-    for (int y = 0; y < 8; ++y) {
-      if (!use_high_bit_depth())
-        for (int j = 0; j < block_size() + width() + height() + 1; j++)
-          pre_[j] = rnd_.Rand8();
-      else
-        for (int j = 0; j < block_size() + width() + height() + 1; j++)
-          CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
-      for (int j = 0; j < block_size(); j++) {
-        wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
-        mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
-      }
-
-      uint32_t sse1, sse2;
-      uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(
-          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
-      var2 = obmc_subpel_variance_ref(
-          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
-          &sse2, use_high_bit_depth(), params_.bit_depth);
-      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
-      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
-    }
-  }
-}
-
-template <>
-void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
-  // Pre: Set the first half of values to the maximum, the second half to 0.
-  // Mask: same as above
-  // WSrc: Set the first half of values to 0, the second half to the maximum.
-  for (int x = 0; x < 8; ++x) {
-    for (int y = 0; y < 8; ++y) {
-      const int half = block_size() / 2;
-      if (!use_high_bit_depth()) {
-        memset(pre_, 255, half);
-        memset(pre_ + half, 0, half + width() + height() + 1);
-      } else {
-        aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
-        aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
-      }
-      for (int j = 0; j < half; j++) {
-        wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
-        mask_[j] = 0;
-      }
-      for (int j = half; j < block_size(); j++) {
-        wsrc_[j] = 0;
-        mask_[j] = kMaskMax * kMaskMax;
-      }
-
-      uint32_t sse1, sse2;
-      uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(
-          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
-      var2 = obmc_subpel_variance_ref(
-          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
-          &sse2, use_high_bit_depth(), params_.bit_depth);
-      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
-      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
-    }
-  }
-}
-
-template <>
-void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
-  if (!use_high_bit_depth())
-    for (int j = 0; j < block_size() + width() + height() + 1; j++)
-      pre_[j] = rnd_.Rand8();
-  else
-    for (int j = 0; j < block_size() + width() + height() + 1; j++)
-      CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
-  for (int j = 0; j < block_size(); j++) {
-    wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
-    mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
-  }
-  unsigned int sse1;
-  const int stride = width() + 1;
-  int run_time = 1000000000 / block_size();
-  aom_usec_timer timer;
-
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < run_time; ++i) {
-    int x = rnd_(8);
-    int y = rnd_(8);
-    ASM_REGISTER_STATE_CHECK(
-        params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
-  }
-  aom_usec_timer_mark(&timer);
-
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
-         params_.bit_depth, elapsed_time);
-}
-
-typedef MseWxHTestClass<MseWxH16bitFunc> MseWxHTest;
-typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
-typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
-typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
 typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
 typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
-typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
-    AvxDistWtdSubpelAvgVarianceTest;
-typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
-typedef TestParams<MseWxH16bitFunc> MseWxHParams;
 
-TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
-TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
-TEST_P(MseWxHTest, RefMse) { RefMatchTestMse(); }
-TEST_P(MseWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
-TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
-TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
-TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
-TEST_P(AvxVarianceTest, Ref) { RefTest(); }
-TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
-TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
-TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
-
-INSTANTIATE_TEST_SUITE_P(
-    C, MseWxHTest,
-    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_c, 8),
-                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_c, 8),
-                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_c, 8),
-                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_c, 8)));
 
 INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
                          ::testing::Values(aom_get_mb_ss_c));
 
-typedef TestParams<Get4x4SseFunc> SseParams;
-INSTANTIATE_TEST_SUITE_P(C, AvxSseTest,
-                         ::testing::Values(SseParams(2, 2,
-                                                     &aom_get4x4sse_cs_c)));
-
-typedef TestParams<VarianceMxNFunc> MseParams;
-INSTANTIATE_TEST_SUITE_P(C, AvxMseTest,
-                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
-                                           MseParams(4, 3, &aom_mse16x8_c),
-                                           MseParams(3, 4, &aom_mse8x16_c),
-                                           MseParams(3, 3, &aom_mse8x8_c)));
-
 typedef TestParams<VarianceMxNFunc> VarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
-                      VarianceParams(7, 6, &aom_variance128x64_c),
-                      VarianceParams(6, 7, &aom_variance64x128_c),
-                      VarianceParams(6, 6, &aom_variance64x64_c),
-                      VarianceParams(6, 5, &aom_variance64x32_c),
-                      VarianceParams(5, 6, &aom_variance32x64_c),
-                      VarianceParams(5, 5, &aom_variance32x32_c),
-                      VarianceParams(5, 4, &aom_variance32x16_c),
-                      VarianceParams(4, 5, &aom_variance16x32_c),
-                      VarianceParams(4, 4, &aom_variance16x16_c),
-                      VarianceParams(4, 3, &aom_variance16x8_c),
-                      VarianceParams(3, 4, &aom_variance8x16_c),
-                      VarianceParams(3, 3, &aom_variance8x8_c),
-                      VarianceParams(3, 2, &aom_variance8x4_c),
-                      VarianceParams(2, 3, &aom_variance4x8_c),
-                      VarianceParams(2, 2, &aom_variance4x4_c),
-
-                      VarianceParams(6, 4, &aom_variance64x16_c),
-                      VarianceParams(4, 6, &aom_variance16x64_c),
-                      VarianceParams(5, 3, &aom_variance32x8_c),
-                      VarianceParams(3, 5, &aom_variance8x32_c),
-                      VarianceParams(4, 2, &aom_variance16x4_c),
-                      VarianceParams(2, 4, &aom_variance4x16_c)));
-
 typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0)));
-
 typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0)));
-
 typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxDistWtdSubpelAvgVarianceTest,
-    ::testing::Values(DistWtdSubpelAvgVarianceParams(
-                          6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
-
-                      DistWtdSubpelAvgVarianceParams(
-                          6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
-                      DistWtdSubpelAvgVarianceParams(
-                          2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_c,
-                          0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    C, AvxObmcSubpelVarianceTest,
-    ::testing::Values(
-        ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
-                                 0),
-        ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0),
-        ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0),
-        ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0),
-        ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0),
-        ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0),
-        ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0),
-        ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0),
-        ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0),
-        ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0),
-        ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0),
-        ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0),
-        ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
-        ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
-        ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
-        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0),
-
-        ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_c, 0),
-        ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_c, 0),
-        ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_c, 0),
-        ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
-        ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
-        ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
 
 typedef uint64_t (*MseHBDWxH16bitFunc)(uint16_t *dst, int dstride,
                                        uint16_t *src, int sstride, int w,
@@ -1918,113 +1381,13 @@
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(
-    SSE2, MseWxHTest,
-    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_sse2, 8),
-                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_sse2, 8),
-                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_sse2, 8),
-                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_sse2, 8)));
-
-INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest,
-                         ::testing::Values(aom_get_mb_ss_sse2));
-
-INSTANTIATE_TEST_SUITE_P(SSE2, AvxMseTest,
-                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2),
-                                           MseParams(4, 3, &aom_mse16x8_sse2),
-                                           MseParams(3, 4, &aom_mse8x16_sse2),
-                                           MseParams(3, 3, &aom_mse8x8_sse2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
-                      VarianceParams(7, 6, &aom_variance128x64_sse2),
-                      VarianceParams(6, 7, &aom_variance64x128_sse2),
-                      VarianceParams(6, 6, &aom_variance64x64_sse2),
-                      VarianceParams(6, 5, &aom_variance64x32_sse2),
-                      VarianceParams(6, 4, &aom_variance64x16_sse2),
-                      VarianceParams(5, 6, &aom_variance32x64_sse2),
-                      VarianceParams(5, 5, &aom_variance32x32_sse2),
-                      VarianceParams(5, 4, &aom_variance32x16_sse2),
-                      VarianceParams(5, 3, &aom_variance32x8_sse2),
-                      VarianceParams(4, 6, &aom_variance16x64_sse2),
-                      VarianceParams(4, 5, &aom_variance16x32_sse2),
-                      VarianceParams(4, 4, &aom_variance16x16_sse2),
-                      VarianceParams(4, 3, &aom_variance16x8_sse2),
-                      VarianceParams(4, 2, &aom_variance16x4_sse2),
-                      VarianceParams(3, 5, &aom_variance8x32_sse2),
-                      VarianceParams(3, 4, &aom_variance8x16_sse2),
-                      VarianceParams(3, 3, &aom_variance8x8_sse2),
-                      VarianceParams(3, 2, &aom_variance8x4_sse2),
-                      VarianceParams(2, 4, &aom_variance4x16_sse2),
-                      VarianceParams(2, 3, &aom_variance4x8_sse2),
-                      VarianceParams(2, 2, &aom_variance4x4_sse2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2,
-                                0)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
     SSE2, MseHBDWxHTest,
     ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
                       MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sse2, 10),
                       MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
                       MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sse2,
                                       10)));
-#endif  // HAVE_SSE2
+
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, AvxSubpelVarianceTest,
@@ -2354,360 +1717,4 @@
 INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
                          ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2));
 #endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
-
-        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3,
-                                0),
-        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3,
-                                0),
-        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
-        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
-        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
-        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3,
-                                0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, AvxDistWtdSubpelAvgVarianceTest,
-    ::testing::Values(
-        DistWtdSubpelAvgVarianceParams(
-            7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
-
-        DistWtdSubpelAvgVarianceParams(
-            6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
-        DistWtdSubpelAvgVarianceParams(
-            2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0)));
-#endif  // HAVE_SSSE3
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, AvxObmcSubpelVarianceTest,
-    ::testing::Values(
-        ObmcSubpelVarianceParams(7, 7,
-                                 &aom_obmc_sub_pixel_variance128x128_sse4_1, 0),
-        ObmcSubpelVarianceParams(7, 6,
-                                 &aom_obmc_sub_pixel_variance128x64_sse4_1, 0),
-        ObmcSubpelVarianceParams(6, 7,
-                                 &aom_obmc_sub_pixel_variance64x128_sse4_1, 0),
-        ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1,
-                                 0),
-
-        ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_sse4_1,
-                                 0),
-        ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1,
-                                 0)));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, MseWxHTest,
-    ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_avx2, 8),
-                      MseWxHParams(3, 2, &aom_mse_wxh_16bit_avx2, 8),
-                      MseWxHParams(2, 3, &aom_mse_wxh_16bit_avx2, 8),
-                      MseWxHParams(2, 2, &aom_mse_wxh_16bit_avx2, 8)));
-
-INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest,
-                         ::testing::Values(MseParams(4, 4,
-                                                     &aom_mse16x16_avx2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
-                      VarianceParams(7, 6, &aom_variance128x64_avx2),
-                      VarianceParams(6, 7, &aom_variance64x128_avx2),
-                      VarianceParams(6, 6, &aom_variance64x64_avx2),
-                      VarianceParams(6, 5, &aom_variance64x32_avx2),
-                      VarianceParams(6, 4, &aom_variance64x16_avx2),
-                      VarianceParams(5, 6, &aom_variance32x64_avx2),
-                      VarianceParams(5, 5, &aom_variance32x32_avx2),
-                      VarianceParams(5, 4, &aom_variance32x16_avx2),
-                      VarianceParams(5, 3, &aom_variance32x8_avx2),
-                      VarianceParams(4, 6, &aom_variance16x64_avx2),
-                      VarianceParams(4, 5, &aom_variance16x32_avx2),
-                      VarianceParams(4, 4, &aom_variance16x16_avx2),
-                      VarianceParams(4, 3, &aom_variance16x8_avx2),
-                      VarianceParams(4, 2, &aom_variance16x4_avx2)));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2,
-                                0),
-        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_avx2,
-                                0),
-        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_avx2,
-                                0),
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2,
-                                0)));
-#endif  // HAVE_AVX2
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AvxSseTest,
-                         ::testing::Values(SseParams(2, 2,
-                                                     &aom_get4x4sse_cs_neon)));
-
-INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest,
-                         ::testing::Values(MseParams(4, 4,
-                                                     &aom_mse16x16_neon)));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon),
-                      VarianceParams(6, 6, &aom_variance64x64_neon),
-                      VarianceParams(7, 6, &aom_variance128x64_neon),
-                      VarianceParams(6, 7, &aom_variance64x128_neon),
-                      VarianceParams(6, 6, &aom_variance64x64_neon),
-                      VarianceParams(6, 5, &aom_variance64x32_neon),
-                      VarianceParams(5, 6, &aom_variance32x64_neon),
-                      VarianceParams(5, 5, &aom_variance32x32_neon),
-                      VarianceParams(5, 4, &aom_variance32x16_neon),
-                      VarianceParams(4, 5, &aom_variance16x32_neon),
-                      VarianceParams(4, 4, &aom_variance16x16_neon),
-                      VarianceParams(4, 3, &aom_variance16x8_neon),
-                      VarianceParams(3, 4, &aom_variance8x16_neon),
-                      VarianceParams(3, 3, &aom_variance8x8_neon),
-                      VarianceParams(3, 2, &aom_variance8x4_neon),
-                      VarianceParams(2, 3, &aom_variance4x8_neon),
-                      VarianceParams(2, 2, &aom_variance4x4_neon)));
-
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon, 0),
-        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_neon, 0),
-        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_neon, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_neon, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_neon, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_neon, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_neon, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_neon, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_neon, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_neon, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_neon, 0),
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_neon, 0),
-
-        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_neon, 0),
-        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_neon, 0),
-        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_neon, 0),
-        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_neon, 0),
-        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_neon, 0),
-        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_neon, 0)));
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
-                         ::testing::Values(aom_get_mb_ss_msa));
-
-INSTANTIATE_TEST_SUITE_P(MSA, AvxSseTest,
-                         ::testing::Values(SseParams(2, 2,
-                                                     &aom_get4x4sse_cs_msa)));
-
-INSTANTIATE_TEST_SUITE_P(MSA, AvxMseTest,
-                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa),
-                                           MseParams(4, 3, &aom_mse16x8_msa),
-                                           MseParams(3, 4, &aom_mse8x16_msa),
-                                           MseParams(3, 3, &aom_mse8x8_msa)));
-
-INSTANTIATE_TEST_SUITE_P(
-    MSA, AvxVarianceTest,
-    ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa),
-                      VarianceParams(6, 5, &aom_variance64x32_msa),
-                      VarianceParams(5, 6, &aom_variance32x64_msa),
-                      VarianceParams(5, 5, &aom_variance32x32_msa),
-                      VarianceParams(5, 4, &aom_variance32x16_msa),
-                      VarianceParams(4, 5, &aom_variance16x32_msa),
-                      VarianceParams(4, 4, &aom_variance16x16_msa),
-                      VarianceParams(4, 3, &aom_variance16x8_msa),
-                      VarianceParams(3, 4, &aom_variance8x16_msa),
-                      VarianceParams(3, 3, &aom_variance8x8_msa),
-                      VarianceParams(3, 2, &aom_variance8x4_msa),
-                      VarianceParams(2, 3, &aom_variance4x8_msa),
-                      VarianceParams(2, 2, &aom_variance4x4_msa)));
-
-INSTANTIATE_TEST_SUITE_P(
-    MSA, AvxSubpelVarianceTest,
-    ::testing::Values(
-        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_msa, 0),
-        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_msa, 0),
-        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_msa, 0),
-        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_msa, 0),
-        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_msa, 0),
-        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_msa, 0),
-        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_msa, 0),
-        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_msa, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_msa, 0),
-        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_msa, 0),
-        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_msa, 0),
-        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_msa, 0),
-        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_msa, 0)));
-
-INSTANTIATE_TEST_SUITE_P(
-    MSA, AvxSubpelAvgVarianceTest,
-    ::testing::Values(
-        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_msa, 0),
-        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_msa, 0),
-        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_msa, 0),
-        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_msa, 0),
-        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_msa, 0),
-        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_msa, 0),
-        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_msa, 0),
-        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_msa, 0),
-        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_msa, 0),
-        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_msa, 0),
-        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_msa, 0),
-        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_msa, 0),
-        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_msa, 0)));
-#endif  // HAVE_MSA
 }  // namespace
diff --git a/test/warp_filter_test.cc b/test/warp_filter_test.cc
index 3bf14df..a7cf750 100644
--- a/test/warp_filter_test.cc
+++ b/test/warp_filter_test.cc
@@ -15,28 +15,12 @@
 #include "test/warp_filter_test_util.h"
 using libaom_test::ACMRandom;
 using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
-using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
 using std::make_tuple;
 using std::tuple;
 
 namespace {
 
-TEST_P(AV1WarpFilterTest, CheckOutput) {
-  RunCheckOutput(std::get<3>(GET_PARAM(0)));
-}
-TEST_P(AV1WarpFilterTest, DISABLED_Speed) {
-  RunSpeedTest(std::get<3>(GET_PARAM(0)));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    C, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
-
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
-
 TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
   RunCheckOutput(std::get<4>(GET_PARAM(0)));
 }
@@ -51,19 +35,9 @@
 
 #if HAVE_AVX2
 INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
-
-INSTANTIATE_TEST_SUITE_P(
     AVX2, AV1HighbdWarpFilterTest,
     libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
 
 #endif  // HAVE_AVX2
 
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1WarpFilterTest,
-    libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
-#endif  // HAVE_NEON
-
 }  // namespace
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index f7ee1e5..e711bb1 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -94,185 +94,6 @@
   }
 }
 
-namespace AV1WarpFilter {
-::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
-    warp_affine_func filter) {
-  WarpTestParam params[] = {
-    make_tuple(4, 4, 5000, filter),  make_tuple(8, 8, 5000, filter),
-    make_tuple(64, 64, 100, filter), make_tuple(4, 16, 2000, filter),
-    make_tuple(32, 8, 1000, filter),
-  };
-  return ::testing::Combine(::testing::ValuesIn(params),
-                            ::testing::Values(0, 1), ::testing::Values(0, 1),
-                            ::testing::Values(0, 1), ::testing::Values(0, 1));
-}
-
-AV1WarpFilterTest::~AV1WarpFilterTest() {}
-void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-
-void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
-
-void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
-  const int w = 128, h = 128;
-  const int border = 16;
-  const int stride = w + 2 * border;
-  WarpTestParam params = GET_PARAM(0);
-  const int out_w = std::get<0>(params), out_h = std::get<1>(params);
-  const int is_alpha_zero = GET_PARAM(1);
-  const int is_beta_zero = GET_PARAM(2);
-  const int is_gamma_zero = GET_PARAM(3);
-  const int is_delta_zero = GET_PARAM(4);
-  int sub_x, sub_y;
-  const int bd = 8;
-
-  uint8_t *input_ = new uint8_t[h * stride];
-  uint8_t *input = input_ + border;
-
-  // The warp functions always write rows with widths that are multiples of 8.
-  // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
-  int output_n = ((out_w + 7) & ~7) * out_h;
-  uint8_t *output = new uint8_t[output_n];
-  int32_t mat[8];
-  int16_t alpha, beta, gamma, delta;
-  ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
-  generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
-                        is_alpha_zero, is_beta_zero, is_gamma_zero,
-                        is_delta_zero);
-
-  for (int r = 0; r < h; ++r)
-    for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
-  for (int r = 0; r < h; ++r) {
-    memset(input + r * stride - border, input[r * stride], border);
-    memset(input + r * stride + w, input[r * stride + (w - 1)], border);
-  }
-
-  sub_x = 0;
-  sub_y = 0;
-  int do_average = 0;
-
-  conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
-
-  const int num_loops = 1000000000 / (out_w + out_h);
-  aom_usec_timer timer;
-  aom_usec_timer_start(&timer);
-  for (int i = 0; i < num_loops; ++i)
-    test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
-              sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
-
-  aom_usec_timer_mark(&timer);
-  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
-         1000.0 * elapsed_time / num_loops);
-
-  delete[] input_;
-  delete[] output;
-  delete[] dsta;
-}
-
-void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
-  const int w = 128, h = 128;
-  const int border = 16;
-  const int stride = w + 2 * border;
-  WarpTestParam params = GET_PARAM(0);
-  const int is_alpha_zero = GET_PARAM(1);
-  const int is_beta_zero = GET_PARAM(2);
-  const int is_gamma_zero = GET_PARAM(3);
-  const int is_delta_zero = GET_PARAM(4);
-  const int out_w = std::get<0>(params), out_h = std::get<1>(params);
-  const int num_iters = std::get<2>(params);
-  int i, j, sub_x, sub_y;
-  const int bd = 8;
-
-  // The warp functions always write rows with widths that are multiples of 8.
-  // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
-  int output_n = ((out_w + 7) & ~7) * out_h;
-  uint8_t *input_ = new uint8_t[h * stride];
-  uint8_t *input = input_ + border;
-  uint8_t *output = new uint8_t[output_n];
-  uint8_t *output2 = new uint8_t[output_n];
-  int32_t mat[8];
-  int16_t alpha, beta, gamma, delta;
-  ConvolveParams conv_params = get_conv_params(0, 0, bd);
-  CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
-  CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
-  for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8();
-
-  for (i = 0; i < num_iters; ++i) {
-    // Generate an input block and extend its borders horizontally
-    for (int r = 0; r < h; ++r)
-      for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
-    for (int r = 0; r < h; ++r) {
-      memset(input + r * stride - border, input[r * stride], border);
-      memset(input + r * stride + w, input[r * stride + (w - 1)], border);
-    }
-    const int use_no_round = rnd_.Rand8() & 1;
-    for (sub_x = 0; sub_x < 2; ++sub_x)
-      for (sub_y = 0; sub_y < 2; ++sub_y) {
-        generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
-                              is_alpha_zero, is_beta_zero, is_gamma_zero,
-                              is_delta_zero);
-
-        for (int ii = 0; ii < 2; ++ii) {
-          for (int jj = 0; jj < 5; ++jj) {
-            for (int do_average = 0; do_average <= 1; ++do_average) {
-              if (use_no_round) {
-                conv_params =
-                    get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
-              } else {
-                conv_params = get_conv_params(0, 0, bd);
-              }
-              if (jj >= 4) {
-              } else {
-                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
-                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
-              }
-              av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
-                                out_h, out_w, sub_x, sub_y, &conv_params, alpha,
-                                beta, gamma, delta);
-              if (use_no_round) {
-                conv_params =
-                    get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
-              }
-              if (jj >= 4) {
-              } else {
-                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
-                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
-              }
-              test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
-                        out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma,
-                        delta);
-              if (use_no_round) {
-                for (j = 0; j < out_w * out_h; ++j)
-                  ASSERT_EQ(dsta[j], dstb[j])
-                      << "Pixel mismatch at index " << j << " = ("
-                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
-                      << i;
-                for (j = 0; j < out_w * out_h; ++j)
-                  ASSERT_EQ(output[j], output2[j])
-                      << "Pixel mismatch at index " << j << " = ("
-                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
-                      << i;
-              } else {
-                for (j = 0; j < out_w * out_h; ++j)
-                  ASSERT_EQ(output[j], output2[j])
-                      << "Pixel mismatch at index " << j << " = ("
-                      << (j % out_w) << ", " << (j / out_w) << ") on iteration "
-                      << i;
-              }
-            }
-          }
-        }
-      }
-  }
-  delete[] input_;
-  delete[] output;
-  delete[] output2;
-  delete[] dsta;
-  delete[] dstb;
-}
-}  // namespace AV1WarpFilter
-
 namespace AV1HighbdWarpFilter {
 ::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
     highbd_warp_affine_func filter) {
diff --git a/test/wiener_test.cc b/test/wiener_test.cc
index 1b3d6f0..1bd5884 100644
--- a/test/wiener_test.cc
+++ b/test/wiener_test.cc
@@ -29,300 +29,6 @@
 #define MAX_WIENER_BLOCK 384
 #define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
 
-// 8-bit-depth tests
-namespace wiener_lowbd {
-
-// C implementation of the algorithm implmented by the SIMD code.
-// This is a little more efficient than the version in av1_compute_stats_c().
-static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
-                                    const uint8_t *src, int h_start, int h_end,
-                                    int v_start, int v_end, int dgd_stride,
-                                    int src_stride, int64_t *M, int64_t *H) {
-  ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
-  int i, j, k, l, m, n;
-  const int pixel_count = (h_end - h_start) * (v_end - v_start);
-  const int wiener_win2 = wiener_win * wiener_win;
-  const int wiener_halfwin = (wiener_win >> 1);
-  uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
-
-  std::vector<std::vector<int64_t> > M_int(wiener_win,
-                                           std::vector<int64_t>(wiener_win, 0));
-  std::vector<std::vector<int64_t> > H_int(
-      wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
-  std::vector<std::vector<int32_t> > sumY(wiener_win,
-                                          std::vector<int32_t>(wiener_win, 0));
-  int32_t sumX = 0;
-  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
-
-  // Main loop handles two pixels at a time
-  // We can assume that h_start is even, since it will always be aligned to
-  // a tile edge + some number of restoration units, and both of those will
-  // be 64-pixel aligned.
-  // However, at the edge of the image, h_end may be odd, so we need to handle
-  // that case correctly.
-  assert(h_start % 2 == 0);
-  for (i = v_start; i < v_end; i++) {
-    const int h_end_even = h_end & ~1;
-    const int has_odd_pixel = h_end & 1;
-    for (j = h_start; j < h_end_even; j += 2) {
-      const uint8_t X1 = src[i * src_stride + j];
-      const uint8_t X2 = src[i * src_stride + j + 1];
-      sumX += X1 + X2;
-
-      const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
-      for (k = 0; k < wiener_win; k++) {
-        for (l = 0; l < wiener_win; l++) {
-          const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
-          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
-          const uint8_t D1 = dgd_ijkl[0];
-          const uint8_t D2 = dgd_ijkl[1];
-          sumY[k][l] += D1 + D2;
-          M_int[l][k] += D1 * X1 + D2 * X2;
-          for (m = 0; m < wiener_win; m++) {
-            for (n = 0; n < wiener_win; n++) {
-              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
-                                       D2 * dgd_ij[n + dgd_stride * m + 1];
-            }
-          }
-        }
-      }
-    }
-    // If the width is odd, add in the final pixel
-    if (has_odd_pixel) {
-      const uint8_t X1 = src[i * src_stride + j];
-      sumX += X1;
-
-      const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
-      for (k = 0; k < wiener_win; k++) {
-        for (l = 0; l < wiener_win; l++) {
-          const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
-          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
-          const uint8_t D1 = dgd_ijkl[0];
-          sumY[k][l] += D1;
-          M_int[l][k] += D1 * X1;
-          for (m = 0; m < wiener_win; m++) {
-            for (n = 0; n < wiener_win; n++) {
-              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m];
-            }
-          }
-        }
-      }
-    }
-  }
-
-  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
-  for (k = 0; k < wiener_win; k++) {
-    for (l = 0; l < wiener_win; l++) {
-      M[l * wiener_win + k] =
-          M_int[l][k] + avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]);
-      for (m = 0; m < wiener_win; m++) {
-        for (n = 0; n < wiener_win; n++) {
-          H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
-              H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
-              (int64_t)avg * (sumY[k][l] + sumY[n][m]);
-        }
-      }
-    }
-  }
-}
-
-void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
-                         int h_start, int h_end, int v_start, int v_end,
-                         int dgd_stride, int src_stride, int64_t *M,
-                         int64_t *H) {
-  if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
-    compute_stats_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
-                            v_end, dgd_stride, src_stride, M, H);
-  } else {
-    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M, H);
-  }
-}
-
-static const int kIterations = 100;
-typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
-                                   const uint8_t *src, int h_start, int h_end,
-                                   int v_start, int v_end, int dgd_stride,
-                                   int src_stride, int64_t *M, int64_t *H);
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-typedef std::tuple<const compute_stats_Func> WienerTestParam;
-
-class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
- public:
-  virtual void SetUp() {
-    src_buf = (uint8_t *)aom_memalign(
-        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
-    dgd_buf = (uint8_t *)aom_memalign(
-        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
-    target_func_ = GET_PARAM(0);
-  }
-  virtual void TearDown() {
-    aom_free(src_buf);
-    aom_free(dgd_buf);
-  }
-  void RunWienerTest(const int32_t wiener_win, int32_t run_times);
-  void RunWienerTest_ExtremeValues(const int32_t wiener_win);
-
- private:
-  compute_stats_Func target_func_;
-  libaom_test::ACMRandom rng_;
-  uint8_t *src_buf;
-  uint8_t *dgd_buf;
-};
-
-void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
-  const int32_t wiener_halfwin = wiener_win >> 1;
-  const int32_t wiener_win2 = wiener_win * wiener_win;
-  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
-  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
-  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
-  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
-  // Note(rachelbarker):
-  // The SIMD code requires `h_start` to be even, but can otherwise
-  // deal with any values of `h_end`, `v_start`, `v_end`. We cover this
-  // entire range, even though (at the time of writing) `h_start` and `v_start`
-  // will always be multiples of 64 when called from non-test code.
-  // If in future any new requirements are added, these lines will
-  // need changing.
-  const int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
-  int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
-  const int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
-  int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
-  const int dgd_stride = h_end;
-  const int src_stride = MAX_DATA_BLOCK;
-  const int iters = run_times == 1 ? kIterations : 2;
-  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
-    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_buf[i] = rng_.Rand8();
-      src_buf[i] = rng_.Rand8();
-    }
-    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
-    uint8_t *src = src_buf;
-
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                          dgd_stride, src_stride, M_ref, H_ref);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < run_times; ++i) {
-      target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                   dgd_stride, src_stride, M_test, H_test);
-    }
-    aom_usec_timer_mark(&timer);
-    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    if (run_times > 10) {
-      printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
-             time2);
-      printf("(%3.2f)\n", time1 / time2);
-    }
-    int failed = 0;
-    for (int i = 0; i < wiener_win2; ++i) {
-      if (M_ref[i] != M_test[i]) {
-        failed = 1;
-        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, M_ref[i], M_test[i]);
-        break;
-      }
-    }
-    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
-      if (H_ref[i] != H_test[i]) {
-        failed = 1;
-        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, H_ref[i], H_test[i]);
-        break;
-      }
-    }
-    ASSERT_EQ(failed, 0);
-  }
-}
-
-void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
-  const int32_t wiener_halfwin = wiener_win >> 1;
-  const int32_t wiener_win2 = wiener_win * wiener_win;
-  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
-  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
-  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
-  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
-  const int h_start = 16;
-  const int h_end = MAX_WIENER_BLOCK;
-  const int v_start = 16;
-  const int v_end = MAX_WIENER_BLOCK;
-  const int dgd_stride = h_end;
-  const int src_stride = MAX_DATA_BLOCK;
-  const int iters = 1;
-  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
-    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
-      dgd_buf[i] = 255;
-      src_buf[i] = 255;
-    }
-    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
-    uint8_t *src = src_buf;
-
-    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                        dgd_stride, src_stride, M_ref, H_ref);
-
-    target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
-                 dgd_stride, src_stride, M_test, H_test);
-
-    int failed = 0;
-    for (int i = 0; i < wiener_win2; ++i) {
-      if (M_ref[i] != M_test[i]) {
-        failed = 1;
-        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, M_ref[i], M_test[i]);
-        break;
-      }
-    }
-    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
-      if (H_ref[i] != H_test[i]) {
-        failed = 1;
-        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
-               wiener_win, iter, i, H_ref[i], H_test[i]);
-        break;
-      }
-    }
-    ASSERT_EQ(failed, 0);
-  }
-}
-
-TEST_P(WienerTest, RandomValues) {
-  RunWienerTest(WIENER_WIN, 1);
-  RunWienerTest(WIENER_WIN_CHROMA, 1);
-}
-
-TEST_P(WienerTest, ExtremeValues) {
-  RunWienerTest_ExtremeValues(WIENER_WIN);
-  RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA);
-}
-
-TEST_P(WienerTest, DISABLED_Speed) {
-  RunWienerTest(WIENER_WIN, 200);
-  RunWienerTest(WIENER_WIN_CHROMA, 200);
-}
-
-INSTANTIATE_TEST_SUITE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTest,
-                         ::testing::Values(av1_compute_stats_sse4_1));
-#endif  // HAVE_SSE4_1
-
-#if HAVE_AVX2
-
-INSTANTIATE_TEST_SUITE_P(AVX2, WienerTest,
-                         ::testing::Values(av1_compute_stats_avx2));
-#endif  // HAVE_AVX2
-
-}  // namespace wiener_lowbd
-
 // High bit-depth tests:
 namespace wiener_highbd {