Upgrade resampler for scaled output from aomdec

When a valid p/q scale factor can be derived for the Lanczos filters, the
2D Lanczos resampler is used instead of libyuv; otherwise aomdec falls
back to libyuv as before.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5bba8b..02e54ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,6 +166,8 @@
 list(
   APPEND
   AOM_DECODER_APP_UTIL_SOURCES
+  "${AOM_ROOT}/av1/common/lanczos_resample.h"
+  "${AOM_ROOT}/av1/common/lanczos_resample.c"
   "${AOM_ROOT}/common/ivfdec.c"
   "${AOM_ROOT}/common/ivfdec.h"
   "${AOM_ROOT}/common/obudec.c"
diff --git a/apps/aomdec.c b/apps/aomdec.c
index b9f3981..25f1ccd 100644
--- a/apps/aomdec.c
+++ b/apps/aomdec.c
@@ -31,6 +31,7 @@
 #include "aom/aomdx.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem_ops.h"
+#include "av1/common/lanczos_resample.h"
 #include "common/args.h"
 #include "common/ivfdec.h"
 #include "common/md5_utils.h"
@@ -120,6 +121,53 @@
   &skipfilmgrain,  NULL
 };
 
+static INLINE int get_plane_size_i420(int size, int is_uv) {
+  // Chroma (is_uv != 0) dimensions are half the given size, rounded up.
+  return is_uv ? (size + 1) >> 1 : size;
+}
+
+static INLINE int lanczos_scale(aom_image_t *src, aom_image_t *dst, int bd) {
+  // Only matching 8-bit (I420) or 16-bit (I42016) 4:2:0 images are handled.
+  const int is_8bit = (src->fmt == AOM_IMG_FMT_I420);
+  if (src->fmt != dst->fmt) return -1;
+  if (!is_8bit && src->fmt != AOM_IMG_FMT_I42016) return -1;
+
+  // Rational factor derived from the widths; bail out if none was found.
+  int num_w = -1, den_w = -1;
+  av1_derive_scale_factor(src->d_w, dst->d_w, &num_w, &den_w);
+  if (num_w <= 0 || den_w <= 0) return -1;
+
+  // The heights must produce exactly the same factor.
+  int num_h = -1, den_h = -1;
+  av1_derive_scale_factor(src->d_h, dst->d_h, &num_h, &den_h);
+  if (num_w != num_h || den_w != den_h) return -1;
+
+  for (int plane = 0; plane < 3; ++plane) {
+    const int chroma = (plane > 0);
+    const int a_hor =
+        chroma ? LANCZOS_A_NORMATIVE_HOR_C : LANCZOS_A_NORMATIVE_HOR_Y;
+    const int a_ver =
+        chroma ? LANCZOS_A_NORMATIVE_VER_C : LANCZOS_A_NORMATIVE_VER_Y;
+    const int in_w = get_plane_size_i420(src->d_w, chroma);
+    const int in_h = get_plane_size_i420(src->d_h, chroma);
+    const int out_w = get_plane_size_i420(dst->d_w, chroma);
+    const int out_h = get_plane_size_i420(dst->d_h, chroma);
+
+    if (is_8bit) {
+      av1_resample_plane_2d_8b_lanczos(
+          src->planes[plane], in_h, in_w, src->stride[plane],
+          dst->planes[plane], out_h, out_w, dst->stride[plane], chroma, chroma,
+          bd, den_w, num_w, a_hor, a_ver);
+    } else {
+      av1_resample_plane_2d_lanczos(
+          (uint16_t *)src->planes[plane], in_h, in_w, src->stride[plane] / 2,
+          (uint16_t *)dst->planes[plane], out_h, out_w, dst->stride[plane] / 2,
+          chroma, chroma, bd, den_w, num_w, a_hor, a_ver);
+    }
+  }
+  return 0;
+}
+
 #if CONFIG_LIBYUV
 static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
                                FilterModeEnum mode) {
@@ -940,18 +988,22 @@
           }
 
           if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+            if (!lanczos_scale(img, scaled_img, img->bit_depth)) {
+              img = scaled_img;
+            } else {
 #if CONFIG_LIBYUV
-            libyuv_scale(img, scaled_img, kFilterBox);
-            img = scaled_img;
+              libyuv_scale(img, scaled_img, kFilterBox);
+              img = scaled_img;
 #else
-            fprintf(
-                stderr,
-                "Failed to scale output frame: %s.\n"
-                "libyuv is required for scaling but is currently disabled.\n"
-                "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n",
-                aom_codec_error(&decoder));
-            goto fail;
+              fprintf(
+                  stderr,
+                  "Failed to scale output frame: %s.\n"
+                  "libyuv is required for scaling but is currently disabled.\n"
+                  "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n",
+                  aom_codec_error(&decoder));
+              goto fail;
 #endif
+            }
           }
         }
         // Default to codec bit depth if output bit depth not set
diff --git a/av1/common/lanczos_resample.c b/av1/common/lanczos_resample.c
index fa79e45..46cf074 100644
--- a/av1/common/lanczos_resample.c
+++ b/av1/common/lanczos_resample.c
@@ -16,8 +16,9 @@
 #include <stdbool.h>
 #include <string.h>
 #include <assert.h>
+#include <limits.h>
 
-#include "tools/lanczos/lanczos_resample.h"
+#include "av1/common/lanczos_resample.h"
 
 /* Shift down with rounding for use when n >= 0, value >= 0 */
 #define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
@@ -166,6 +167,8 @@
 }
 
 static void integerize_array(double *x, int len, int bits, int16_t *y) {
+  assert(len <= MAX_FILTER_LEN);
+
   int sumy = 0;
   for (int i = 0; i < len; ++i) {
     y[i] = (int16_t)rint(x[i] * (1 << bits));
@@ -173,7 +176,7 @@
   }
   while (sumy > (1 << bits)) {
     double mx = -65536.0;
-    int imx = -1;
+    int imx = 0;
     for (int i = 0; i < len; ++i) {
       const double v = (double)y[i] - (x[i] * (1 << bits));
       if (v > mx) {
@@ -181,12 +184,13 @@
         imx = i;
       }
     }
+    assert(imx >= 0 && imx < len);
     y[imx] -= 1;
     sumy -= 1;
   }
   while (sumy < (1 << bits)) {
     double mx = 65536.0;
-    int imx = -1;
+    int imx = 0;
     for (int i = 0; i < len; ++i) {
       const double v = (double)y[i] - (x[i] * (1 << bits));
       if (v < mx) {
@@ -194,6 +198,7 @@
         imx = i;
       }
     }
+    assert(imx >= 0 && imx < len);
     y[imx] += 1;
     sumy += 1;
   }
@@ -257,6 +262,13 @@
 int get_resample_filter(int p, int q, int a, double x0, EXT_TYPE ext_type,
                         WIN_TYPE win_type, int subsampled, int bits,
                         RationalResampleFilter *rf) {
+  // Initialization to silence static analysis warnings.
+  for (int phase = 0; phase < MAX_RATIONAL_FACTOR; ++phase) {
+    for (int i = 0; i < MAX_FILTER_LEN; ++i) {
+      rf->filter[phase][i] = 0;
+    }
+  }
+
   double offset[MAX_RATIONAL_FACTOR + 1];
   int intpel[MAX_RATIONAL_FACTOR];
   if (p <= 0 || q <= 0) {
@@ -340,11 +352,18 @@
                              ClipProfile *clip, int16_t *y, int outlen) {
   (void)inlen;
   const int tapsby2 = rf->length / 2;
+  assert(tapsby2 * 2 <= MAX_FILTER_LEN);
+  assert(rf->p <= MAX_RATIONAL_FACTOR);
+
   const int16_t *xext = x;
   xext += rf->start;
   for (int i = 0, p = 0; i < outlen; ++i, p = (p + 1) % rf->p) {
     int64_t sum = 0;
+
+    assert(p >= 0 && p < MAX_RATIONAL_FACTOR);
     for (int j = -tapsby2 + 1; j <= tapsby2; ++j) {
+      assert(j + tapsby2 - 1 >= 0);
+      assert(j + tapsby2 - 1 < MAX_FILTER_LEN);
       sum += (int)rf->filter[p][j + tapsby2 - 1] * (int)xext[j];
     }
     sum = ROUND_POWER_OF_TWO_SIGNED(sum, downshift);
@@ -527,6 +546,256 @@
   free(tmparr_);
 }
 
+// Filters 8-bit input x into int16_t output y using the phase filters in rf.
+// x must already be border-extended, with x pointing at the leftmost pixel.
+static void resample_1d_core_in8b(const uint8_t *x, int inlen,
+                                  RationalResampleFilter *rf, int downshift,
+                                  ClipProfile *clip, int16_t *y, int outlen) {
+  (void)inlen;  // unused
+  const int tapsby2 = rf->length / 2;
+  assert(tapsby2 * 2 <= MAX_FILTER_LEN);
+  assert(rf->p <= MAX_RATIONAL_FACTOR);
+
+  const uint8_t *xext = x;
+  xext += rf->start;
+  for (int i = 0, p = 0; i < outlen; ++i, p = (p + 1) % rf->p) {  // p: phase
+    int64_t sum = 0;
+    // Dot product of phase p's taps with the samples around xext.
+    assert(p >= 0 && p < MAX_RATIONAL_FACTOR);
+    for (int j = -tapsby2 + 1; j <= tapsby2; ++j) {
+      assert(j + tapsby2 - 1 >= 0);
+      assert(j + tapsby2 - 1 < MAX_FILTER_LEN);
+      sum += (int)rf->filter[p][j + tapsby2 - 1] * (int)xext[j];
+    }
+    sum = ROUND_POWER_OF_TWO_SIGNED(sum, downshift);
+    if (clip) {
+      y[i] = (int16_t)(clip->issigned
+                           ? doclip((int)sum, -(1 << (clip->bits - 1)),
+                                    (1 << (clip->bits - 1)) - 1)
+                           : doclip((int)sum, 0, (1 << clip->bits) - 1));
+    } else {  // no profile: saturate to the int16_t range
+      y[i] = (int16_t)doclip((int)sum, -(1 << 15), (1 << 15) - 1);
+    }
+    xext += rf->steps[p];  // advance the input by this phase's step
+  }
+}
+
+// 8-bit in, 8-bit out variant of the 1-D filtering core. x must already be
+// border-extended on both sides, with x pointing at the leftmost pixel.
+static void resample_1d_core_8b(const uint8_t *x, int inlen,
+                                RationalResampleFilter *rf, int downshift,
+                                ClipProfile *clip, uint8_t *y, int outlen) {
+  (void)inlen;  // unused
+  const int tapsby2 = rf->length / 2;
+  const uint8_t *xext = x;
+  xext += rf->start;
+  for (int i = 0, p = 0; i < outlen; ++i, p = (p + 1) % rf->p) {  // p: phase
+    int64_t sum = 0;
+    for (int j = -tapsby2 + 1; j <= tapsby2; ++j) {
+      sum += (int)rf->filter[p][j + tapsby2 - 1] * (int)xext[j];
+    }
+    sum = ROUND_POWER_OF_TWO_SIGNED(sum, downshift);
+    if (clip) {
+      y[i] = (uint8_t)(clip->issigned
+                           ? doclip((int)sum, -(1 << (clip->bits - 1)),
+                                    (1 << (clip->bits - 1)) - 1)
+                           : doclip((int)sum, 0, (1 << clip->bits) - 1));
+    } else {  // no profile: saturate (not wrap) to the uint8_t range
+      y[i] = (uint8_t)doclip((int)sum, 0, 255);
+    }
+    xext += rf->steps[p];  // advance the input by this phase's step
+  }
+}
+
+static void extend_border_8b(uint8_t *x, int inlen, EXT_TYPE ext_type,
+                             int border) {
+  switch (ext_type) {  // fills x[-border, 0) and x[inlen, inlen + border)
+    case EXT_REPEAT:  // replicate the first / last sample
+      for (int i = -border; i < 0; ++i) x[i] = x[0];
+      for (int i = 0; i < border; ++i) x[i + inlen] = x[inlen - 1];
+      break;
+    case EXT_SYMMETRIC:  // mirror, duplicating the edge sample
+      if (inlen >= border) {
+        for (int i = -border; i < 0; ++i) x[i] = x[-i - 1];
+        for (int i = 0; i < border; ++i) x[i + inlen] = x[inlen - 1 - i];
+      } else {  // short input: clamp the mirrored index into [0, inlen)
+        for (int i = -border; i < 0; ++i)
+          x[i] = x[(-i - 1 > inlen - 1 ? inlen - 1 : -i - 1)];
+        for (int i = 0; i < border; ++i)
+          x[i + inlen] = x[(inlen - 1 - i < 0 ? 0 : inlen - 1 - i)];
+      }
+      break;
+    case EXT_REFLECT:  // mirror about the edge sample (edge not duplicated)
+      if (inlen > border) {
+        for (int i = -border; i < 0; ++i) x[i] = x[-i];
+        for (int i = 0; i < border; ++i) x[i + inlen] = x[inlen - 2 - i];
+      } else {  // short input: clamp the mirrored index into [0, inlen)
+        for (int i = -border; i < 0; ++i)
+          x[i] = x[(-i > inlen - 1 ? inlen - 1 : -i)];
+        for (int i = 0; i < border; ++i)
+          x[i + inlen] = x[(inlen - 2 - i < 0 ? 0 : inlen - 2 - i)];
+      }
+      break;
+    case EXT_GRADIENT:  // 2 * edge - mirrored sample, clipped to [0, 255]
+      if (inlen > border) {
+        for (int i = -border; i < 0; ++i) {
+          const int t = 2 * x[0] - x[-i];
+          x[i] = (uint8_t)doclip(t, 0, 255);
+        }
+        for (int i = 0; i < border; ++i) {
+          const int t = 2 * x[inlen - 1] - x[inlen - 2 - i];
+          x[i + inlen] = (uint8_t)doclip(t, 0, 255);
+        }
+      } else {  // short input: clamp the mirrored index into [0, inlen)
+        for (int i = -border; i < 0; ++i) {
+          const int t = 2 * x[0] - x[(-i > inlen - 1 ? inlen - 1 : -i)];
+          x[i] = (uint8_t)doclip(t, 0, 255);
+        }
+        for (int i = 0; i < border; ++i) {
+          const int t =
+              2 * x[inlen - 1] - x[(inlen - 2 - i < 0 ? 0 : inlen - 2 - i)];
+          x[i + inlen] = (uint8_t)doclip(t, 0, 255);
+        }
+      }
+      break;
+  }
+}
+
+static void resample_1d_xt_8b(uint8_t *x, int inlen, RationalResampleFilter *rf,
+                              int downshift, ClipProfile *clip, uint8_t *y,
+                              int outlen) {
+  extend_border_8b(x, inlen, rf->ext_type, rf->length / 2);  // pads in place
+  resample_1d_core_8b(x, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void resample_1d_xc_8b(const uint8_t *x, int inlen,
+                              RationalResampleFilter *rf, int downshift,
+                              ClipProfile *clip, uint8_t *y, int outlen,
+                              uint8_t *xext) {
+  memcpy(xext, x, inlen * sizeof(*x));
+  // xext needs rf->length / 2 writable samples on either side of the copy.
+  resample_1d_xt_8b(xext, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void resample_1d_xt_in8b(uint8_t *x, int inlen,
+                                RationalResampleFilter *rf, int downshift,
+                                ClipProfile *clip, int16_t *y, int outlen) {
+  extend_border_8b(x, inlen, rf->ext_type, rf->length / 2);  // pads in place
+  resample_1d_core_in8b(x, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void resample_1d_xc_in8b(const uint8_t *x, int inlen,
+                                RationalResampleFilter *rf, int downshift,
+                                ClipProfile *clip, int16_t *y, int outlen,
+                                uint8_t *xext) {
+  memcpy(xext, x, inlen * sizeof(*x));
+  // xext needs rf->length / 2 writable samples on either side of the copy.
+  resample_1d_xt_in8b(xext, inlen, rf, downshift, clip, y, outlen);
+}
+
+// Gathers a column of 8-bit samples into a contiguous int16_t array:
+// arr[k] = img[k * stride] for k in [0, len).
+static void fill_col_to_arr_in8b(const uint8_t *img, int stride, int len,
+                                 int16_t *arr) {
+  const uint8_t *src = img;
+  for (int k = 0; k < len; ++k, src += stride) {
+    arr[k] = (int16_t)*src;
+  }
+}
+
+// Scatters a contiguous int16_t array into a column of an 8-bit image:
+// img[k * stride] = (uint8_t)arr[k] for k in [0, len).
+static void fill_arr_to_col_out8b(uint8_t *img, int stride, int len,
+                                  const int16_t *arr) {
+  uint8_t *dst = img;
+  for (int k = 0; k < len; ++k, dst += stride) {
+    *dst = (uint8_t)arr[k];
+  }
+}
+
+void resample_1d_8b(const uint8_t *x, int inlen, RationalResampleFilter *rf,
+                    int downshift, ClipProfile *clip, uint8_t *y, int outlen) {
+  // Scratch buffer: room for inlen samples plus rf->length of border in total.
+  uint8_t *const padded = (uint8_t *)malloc((inlen + rf->length) * sizeof(*x));
+  uint8_t *const interior = padded + rf->length / 2;
+
+  resample_1d_xc_8b(x, inlen, rf, downshift, clip, y, outlen, interior);
+
+  free(padded);
+}
+
+void av1_resample_2d_8b(const uint8_t *x, int inwidth, int inheight,
+                        int instride, RationalResampleFilter *rfh,
+                        RationalResampleFilter *rfv, int int_extra_bits,
+                        ClipProfile *clip, uint8_t *y, int outwidth,
+                        int outheight, int outstride) {
+  // Separable 2-D resampling of an 8-bit plane: rows are filtered with rfh
+  // into an int16_t buffer, whose columns are then filtered with rfv into y.
+  if (rfv == NULL || is_resampler_noop(rfv)) {  // horizontal pass only
+    resample_horz_8b(x, inwidth, inheight, instride, rfh, clip, y, outwidth,
+                     outstride);
+    return;
+  }
+  if (rfh == NULL || is_resampler_noop(rfh)) {  // vertical pass only
+    resample_vert_8b(x, inwidth, inheight, instride, rfv, clip, y, outheight,
+                     outstride);
+    return;
+  }
+  int16_t *tmpbuf = (int16_t *)malloc(sizeof(int16_t) * outwidth * inheight);
+  const int vlen = inheight + rfv->length;
+  const int hlen = inwidth + rfh->length;
+  const int arrsize = outheight + (vlen > hlen ? vlen : hlen);
+  int16_t *tmparr_ = (int16_t *)calloc(arrsize, sizeof(int16_t));
+  int16_t *tmparrh = tmparr_ + outheight + rfh->length / 2;  // row scratch
+  int16_t *tmparrv = tmparr_ + outheight + rfv->length / 2;  // column scratch
+  int16_t *tmparro = tmparr_;  // one filtered output column
+  // intermediate data is stored in 16 bit buffers, so limit int_extra_bits
+  int_extra_bits = MIN(int_extra_bits, 14 - clip->bits);
+  const int downshifth = rfh->filter_bits - int_extra_bits;
+  const int downshiftv = rfv->filter_bits + int_extra_bits;  // vertical gain
+  for (int i = 0; i < inheight; ++i) {
+    resample_1d_xc_in8b(x + instride * i, inwidth, rfh, downshifth, NULL,
+                        tmpbuf + i * outwidth, outwidth, (uint8_t *)tmparrh);
+  }
+  for (int i = 0; i < outwidth; ++i) {
+    fill_col_to_arr(tmpbuf + i, outwidth, inheight, tmparrv);
+    resample_1d_xt(tmparrv, inheight, rfv, downshiftv, clip, tmparro,
+                   outheight);
+    fill_arr_to_col_out8b(y + i, outstride, outheight, tmparro);
+  }
+  free(tmpbuf);
+  free(tmparr_);
+}
+
+void resample_horz_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfh, ClipProfile *clip,
+                      uint8_t *y, int outwidth, int outstride) {
+  // One zero-initialized scratch row with border room, reused for every row.
+  uint8_t *buf = (uint8_t *)calloc(inwidth + rfh->length, sizeof(*buf));
+  uint8_t *row_buf = buf + rfh->length / 2;
+  for (int row = 0; row < inheight; ++row) {
+    resample_1d_xc_8b(x + instride * row, inwidth, rfh, rfh->filter_bits, clip,
+                      y + row * outstride, outwidth, row_buf);
+  }
+  free(buf);
+}
+
+void resample_vert_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfv, ClipProfile *clip,
+                      uint8_t *y, int outheight, int outstride) {
+  // Scratch layout: the output column first, then the padded input column.
+  const int total = outheight + inheight + rfv->length;
+  int16_t *const col_out = (int16_t *)calloc(total, sizeof(*col_out));
+  int16_t *const col_in = col_out + outheight + rfv->length / 2;
+  for (int col = 0; col < inwidth; ++col) {
+    fill_col_to_arr_in8b(x + col, instride, inheight, col_in);
+    resample_1d_xt(col_in, inheight, rfv, rfv->filter_bits, clip, col_out,
+                   outheight);
+    fill_arr_to_col_out8b(y + col, outstride, outheight, col_out);
+  }
+  free(col_out);
+}
+
 int get_resampled_output_length(int inlen, int p, int q, int force_even) {
   if (!force_even) {
     // round
@@ -539,3 +808,106 @@
   else
     return outlen_floor;
 }
+
+void av1_derive_scale_factor(int width, int width_scaled, int *p, int *q) {
+  assert(width > 0);
+  assert(width_scaled > 0);
+
+  // -1 / -1 signals that no usable factor was found.
+  *p = *q = -1;
+
+  // Lanczos library supports a scaling factor p/q with both p and q <= 16.
+  if ((width > (width_scaled << 4)) || (width_scaled > (width << 4))) return;
+
+  // Candidates compete on err / denom, compared by cross-multiplication so
+  // the search stays in integers. The baseline to beat is the error of 1/1.
+  int min_err = abs(width - width_scaled);
+  int min_err_denom = 1;
+  int cand_p = -1, cand_q = -1;
+
+  for (int d = 1; d <= 16; ++d) {
+    for (int n = 1; n <= 16; ++n) {
+      const int e = abs(n * width - d * width_scaled);
+      if (e * min_err_denom >= min_err * d) continue;
+      cand_p = n;
+      cand_q = d;
+      min_err = e;
+      min_err_denom = d;
+    }
+  }
+
+  // Keep the candidate only if its error is at most (denom * width) >> 5.
+  const int accept = min_err <= ((min_err_denom * width) >> 5);
+  *p = accept ? cand_p : -1;
+  *q = accept ? cand_q : -1;
+}
+
+void av1_resample_plane_2d_lanczos(const uint16_t *const input, int height,
+                                   int width, int in_stride, uint16_t *output,
+                                   int height2, int width2, int out_stride,
+                                   int subx, int suby, int bd, int denom,
+                                   int num, int lanczos_a_hor,
+                                   int lanczos_a_ver) {
+  // Fixed configuration: 14-bit filter coefficients, 2 extra bits of
+  // intermediate precision, Lanczos window, repeat border extension.
+  // On filter generation failure the process exits with status 1.
+  const int coeff_bits = 14;
+  const int extra_bits = 2;
+  ClipProfile clip = { bd, 0 };
+
+  // Index 0 is the horizontal direction, index 1 the vertical one.
+  const int lanczos_a[2] = { lanczos_a_hor, lanczos_a_ver };
+  const int subsampled[2] = { subx, suby };
+  RationalResampleFilter rf[2];
+
+  for (int dir = 0; dir < 2; ++dir) {
+    // The x0 argument carries a character code: 'd' if subsampled, else 'c'.
+    const double x0 = subsampled[dir] ? (double)('d') : (double)('c');
+    if (!get_resample_filter(num, denom, lanczos_a[dir], x0, EXT_REPEAT,
+                             WIN_LANCZOS, subsampled[dir], coeff_bits,
+                             &rf[dir])) {
+      fprintf(stderr, "Cannot generate filter, exiting!\n");
+      exit(1);
+    }
+  }
+
+  // Samples are reinterpreted as int16_t, the type av1_resample_2d works on.
+  av1_resample_2d((const int16_t *)input, width, height, in_stride, &rf[0],
+                  &rf[1], extra_bits, &clip, (int16_t *)output, width2,
+                  height2, out_stride);
+}
+
+void av1_resample_plane_2d_8b_lanczos(const uint8_t *const input, int height,
+                                      int width, int in_stride, uint8_t *output,
+                                      int height2, int width2, int out_stride,
+                                      int subx, int suby, int bd, int denom,
+                                      int num, int lanczos_a_hor,
+                                      int lanczos_a_ver) {
+  // Fixed configuration: 14-bit filter coefficients, 2 extra bits of
+  // intermediate precision, Lanczos window, repeat border extension.
+  // On filter generation failure the process exits with status 1.
+  const int coeff_bits = 14;
+  const int extra_bits = 2;
+  ClipProfile clip = { bd, 0 };
+
+  // Index 0 is the horizontal direction, index 1 the vertical one.
+  const int lanczos_a[2] = { lanczos_a_hor, lanczos_a_ver };
+  const int subsampled[2] = { subx, suby };
+  RationalResampleFilter rf[2];
+
+  for (int dir = 0; dir < 2; ++dir) {
+    // The x0 argument carries a character code: 'd' if subsampled, else 'c'.
+    const double x0 = subsampled[dir] ? (double)('d') : (double)('c');
+    if (!get_resample_filter(num, denom, lanczos_a[dir], x0, EXT_REPEAT,
+                             WIN_LANCZOS, subsampled[dir], coeff_bits,
+                             &rf[dir])) {
+      fprintf(stderr, "Cannot generate filter, exiting!\n");
+      exit(1);
+    }
+  }
+
+  // 8-bit samples go straight to the 8-bit 2-D resampler.
+  av1_resample_2d_8b(input, width, height, in_stride, &rf[0], &rf[1],
+                     extra_bits, &clip, output, width2, height2,
+                     out_stride);
+}
diff --git a/av1/common/lanczos_resample.h b/av1/common/lanczos_resample.h
index 08d9d97..f40d7a3 100644
--- a/av1/common/lanczos_resample.h
+++ b/av1/common/lanczos_resample.h
@@ -20,6 +20,32 @@
 #define MAX_RATIONAL_FACTOR 16
 #define MAX_FILTER_LEN 320
 
+void av1_derive_scale_factor(int width, int width_scaled, int *p, int *q);
+
+#define LANCZOS_A_NORMATIVE_HOR_Y 6  // Normative hor Lanczos a Luma
+#define LANCZOS_A_NORMATIVE_HOR_C 4  // Normative hor Lanczos a Chroma
+#define LANCZOS_A_NORMATIVE_VER_Y 4  // Normative ver Lanczos a Luma
+#define LANCZOS_A_NORMATIVE_VER_C 4  // Normative ver Lanczos a Chroma
+
+#define LANCZOS_A_NONNORMATIVE_HOR_Y 6  // Non-normative hor Lanczos a Luma
+#define LANCZOS_A_NONNORMATIVE_HOR_C 4  // Non-normative hor Lanczos a Chroma
+#define LANCZOS_A_NONNORMATIVE_VER_Y 6  // Non-normative ver Lanczos a Luma
+#define LANCZOS_A_NONNORMATIVE_VER_C \
+  4  // Non-normative ver Lanczos a Chroma
+// 2-D Lanczos resampling of a single plane (16-bit and 8-bit variants).
+void av1_resample_plane_2d_lanczos(const uint16_t *const input, int height,
+                                   int width, int in_stride, uint16_t *output,
+                                   int height2, int width2, int out_stride,
+                                   int subx, int suby, int bd, int denom,
+                                   int num, int lanczos_a_hor,
+                                   int lanczos_a_ver);
+void av1_resample_plane_2d_8b_lanczos(const uint8_t *const input, int height,
+                                      int width, int in_stride, uint8_t *output,
+                                      int height2, int width2, int out_stride,
+                                      int subx, int suby, int bd, int denom,
+                                      int num, int lanczos_a_hor,
+                                      int lanczos_a_ver);
+
 // Note: check window() function implementation for values of any
 // other params used by these windowing functions.
 typedef enum {
@@ -96,6 +122,26 @@
                    RationalResampleFilter *rfv, ClipProfile *clip, int16_t *y,
                    int outheight, int outstride);
 
+// 8-bit versions of the high-level resampling functions.
+
+// x need not be border-extended: a padded copy is made internally.
+void resample_1d_8b(const uint8_t *x, int inlen, RationalResampleFilter *rf,
+                    int downshift, ClipProfile *clip, uint8_t *y, int outlen);
+
+void av1_resample_2d_8b(const uint8_t *x, int inwidth, int inheight,
+                        int instride, RationalResampleFilter *rfh,
+                        RationalResampleFilter *rfv, int int_extra_bits,
+                        ClipProfile *clip, uint8_t *y, int outwidth,
+                        int outheight, int outstride);
+
+void resample_horz_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfh, ClipProfile *clip,
+                      uint8_t *y, int outwidth, int outstride);
+
+void resample_vert_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfv, ClipProfile *clip,
+                      uint8_t *y, int outheight, int outstride);
+
 void show_resample_filter(RationalResampleFilter *rf);
 
 int get_resampled_output_length(int inlen, int p, int q, int force_even);
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 0b2ab30..53b040a 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -17,8 +17,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "aom_ports/system_state.h"
-
 #include "config/aom_config.h"
 
 #include "aom_dsp/aom_dsp_common.h"
@@ -989,54 +987,6 @@
   aom_free(arrbuf2);
 }
 
-#define LANCZOS_A_NORMATIVE_HOR_Y 6  // Normative hor Lanczos a Luma
-#define LANCZOS_A_NORMATIVE_HOR_C 4  // Normative hor Lanczos a Chroma
-#define LANCZOS_A_NORMATIVE_VER_Y 4  // Normative ver Lanczos a Luma
-#define LANCZOS_A_NORMATIVE_VER_C 4  // Normative ver Lanczos a Chroma
-
-#define LANCZOS_A_NONNORMATIVE_HOR_Y 6  // Non-normative hor Lanczos a Luma
-#define LANCZOS_A_NONNORMATIVE_HOR_C 4  // Non-normative hor Lanczos a Chroma
-#define LANCZOS_A_NONNORMATIVE_VER_Y 6  // Non-normative ver Lanczos a Luma
-#define LANCZOS_A_NONNORMATIVE_VER_C \
-  4  // Non-normative ver Lanczos a Chroma
-     // Chroma
-void av1_resample_plane_2d_lanczos(const uint16_t *const input, int height,
-                                   int width, int in_stride, uint16_t *output,
-                                   int height2, int width2, int out_stride,
-                                   int subx, int suby, int bd, int denom,
-                                   int num, int lanczos_a_hor,
-                                   int lanczos_a_ver) {
-  (void)suby;
-
-  int coeff_prec_bits = 14;
-  int extra_prec_bits = 2;
-  WIN_TYPE win = WIN_LANCZOS;
-  EXT_TYPE ext = EXT_REPEAT;
-  ClipProfile clip = { bd, 0 };
-  int horz_a = lanczos_a_hor;
-  int vert_a = lanczos_a_ver;
-  double horz_x0 = subx ? (double)('d') : (double)('c');
-  double vert_x0 = (double)('c');
-
-  RationalResampleFilter horz_rf;
-  RationalResampleFilter vert_rf;
-
-  if (!get_resample_filter(num, denom, horz_a, horz_x0, ext, win, subx,
-                           coeff_prec_bits, &horz_rf)) {
-    fprintf(stderr, "Cannot generate filter, exiting!\n");
-    exit(1);
-  }
-  if (!get_resample_filter(num, denom, vert_a, vert_x0, ext, win, 0,
-                           coeff_prec_bits, &vert_rf)) {
-    fprintf(stderr, "Cannot generate filter, exiting!\n");
-    exit(1);
-  }
-
-  av1_resample_2d((const int16_t *)input, width, height, in_stride, &horz_rf,
-                  &vert_rf, extra_prec_bits, &clip, (int16_t *)output, width2,
-                  height2, out_stride);
-}
-
 void av1_resize_lanczos_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                          YV12_BUFFER_CONFIG *dst, int bd,
                                          const int num_planes, const int subx,
@@ -1057,41 +1007,6 @@
   aom_extend_frame_borders(dst, num_planes);
 }
 
-static void derive_scale_factor(int width, int width_scaled, int *p, int *q) {
-  assert(width > 0);
-  assert(width_scaled > 0);
-
-  *p = -1;
-  *q = -1;
-
-  // Lanczos library supports a scaling factor p/q with both p and q <= 16.
-  if ((width > (width_scaled << 4)) || (width_scaled > (width << 4))) return;
-
-  aom_clear_system_state();
-
-  const float scale_factor = (float)width_scaled / (float)width;
-  const float error_thresh = 0.05f;
-  float error_min = 1.0f;
-
-  for (int denom = 1; denom <= 16; ++denom) {
-    for (int num = 1; num <= 16; ++num) {
-      float error = fabsf((float)num / (float)denom - scale_factor);
-
-      if (error < error_min) {
-        *p = num;
-        *q = denom;
-        error_min = error;
-      }
-    }
-  }
-
-  if (error_min > error_thresh) {
-    *p = -1;
-    *q = -1;
-  }
-  return;
-}
-
 #if CONFIG_EXT_SUPERRES
 int64_t av1_downup_lanczos_sse(const YV12_BUFFER_CONFIG *src, int bd, int denom,
                                int num) {
@@ -1449,8 +1364,8 @@
       // TODO(yuec): implement 1D superres based on lanczos resampling
       if (cm->superres_scale_denominator == SCALE_NUMERATOR)
 #endif
-        derive_scale_factor(unscaled->y_crop_width, scaled->y_crop_width,
-                            &scale_num, &scale_denom);
+        av1_derive_scale_factor(unscaled->y_crop_width, scaled->y_crop_width,
+                                &scale_num, &scale_denom);
 
       if (scale_denom > 0 && scale_num > 0) {
         av1_resize_lanczos_and_extend_frame(