Make resampling lib more efficient in memory usage

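Streamlines the resampling code to reduce memory usage and eliminate
unnecessary buffer copies. Adds dedicated 8-bit entry points
(resample_1d_8b, resample_2d_8b, resample_horz_8b, resample_vert_8b) so
that 8-bit frames are resampled directly from the file buffers instead of
being converted to and from intermediate int16_t arrays, and makes the
low-level 1-D helpers static to lanczos_resample.c.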

Change-Id: I1e4dd76e59bf2483579557af709baa32c8370e67
diff --git a/tools/lanczos/lanczos_resample.c b/tools/lanczos/lanczos_resample.c
index d550a14..97db428 100644
--- a/tools/lanczos/lanczos_resample.c
+++ b/tools/lanczos/lanczos_resample.c
@@ -223,9 +223,11 @@
   get_resample_filter(q, p, a, y0, bits, rf);
 }
 
-void resample_1d_core(const int16_t *x, int inlen, RationalResampleFilter *rf,
-                      int downshift, ClipProfile *clip, int16_t *y,
-                      int outlen) {
+// Assume x buffer is already extended on both sides with x pointing to the
+// leftmost pixel, and the extension values are already filled up.
+static void resample_1d_core(const int16_t *x, int inlen,
+                             RationalResampleFilter *rf, int downshift,
+                             ClipProfile *clip, int16_t *y, int outlen) {
   (void)inlen;
   const int tapsby2 = rf->length / 2;
   const int16_t *xext = x;
@@ -245,34 +247,30 @@
   }
 }
 
-void resample_1d_xt(int16_t *x, int inlen, RationalResampleFilter *rf,
-                    int downshift, ClipProfile *clip, int16_t *y, int outlen) {
-  const int tapsby2 = rf->length / 2;
-  for (int i = -tapsby2; i < 0; ++i) x[i] = x[0];
-  for (int i = 0; i < tapsby2; ++i) x[i + inlen] = x[inlen - 1];
+static void extend_border(int16_t *x, int inlen, int border) {
+  for (int i = -border; i < 0; ++i) x[i] = x[0];
+  for (int i = 0; i < border; ++i) x[i + inlen] = x[inlen - 1];
+}
+
+// Assume x buffer is already extended on both sides with x pointing to the
+// leftmost pixel, but the extension values are not filled up.
+static void resample_1d_xt(int16_t *x, int inlen, RationalResampleFilter *rf,
+                           int downshift, ClipProfile *clip, int16_t *y,
+                           int outlen) {
+  extend_border(x, inlen, rf->length / 2);
   resample_1d_core(x, inlen, rf, downshift, clip, y, outlen);
 }
 
-void resample_1d_xc(const int16_t *x, int inlen, RationalResampleFilter *rf,
-                    int downshift, ClipProfile *clip, int16_t *y, int outlen,
-                    int16_t *xext) {
+// Assume xext points rf->length / 2 into a scratch of inlen + rf->length.
+static void resample_1d_xc(const int16_t *x, int inlen,
+                           RationalResampleFilter *rf, int downshift,
+                           ClipProfile *clip, int16_t *y, int outlen,
+                           int16_t *xext) {
   memcpy(xext, x, sizeof(*x) * inlen);
 
   resample_1d_xt(xext, inlen, rf, downshift, clip, y, outlen);
 }
 
-void resample_1d(const int16_t *x, int inlen, RationalResampleFilter *rf,
-                 int downshift, ClipProfile *clip, int16_t *y, int outlen) {
-  const int tapsby2 = rf->length / 2;
-  int16_t *xext_ = (int16_t *)malloc((inlen + rf->length) * sizeof(*x));
-  int16_t *xext = xext_ + tapsby2;
-  memcpy(xext, x, sizeof(*x) * inlen);
-
-  resample_1d_xt(xext, inlen, rf, downshift, clip, y, outlen);
-
-  free(xext_);
-}
-
 static void fill_col_to_arr(const int16_t *img, int stride, int len,
                             int16_t *arr) {
   int i;
@@ -293,6 +291,17 @@
   }
 }
 
+void resample_1d(const int16_t *x, int inlen, RationalResampleFilter *rf,
+                 int downshift, ClipProfile *clip, int16_t *y, int outlen) {
+  const int tapsby2 = rf->length / 2;
+  int16_t *xext_ = (int16_t *)malloc((inlen + rf->length) * sizeof(*x));
+  int16_t *xext = xext_ + tapsby2;
+  // Source must be the caller's x; the scratch xext is still uninitialized.
+  resample_1d_xc(x, inlen, rf, downshift, clip, y, outlen, xext);
+
+  free(xext_);
+}
+
 void resample_2d(const int16_t *x, int inwidth, int inheight, int instride,
                  RationalResampleFilter *rfh, RationalResampleFilter *rfv,
                  int int_extra_bits, ClipProfile *clip, int16_t *y,
@@ -362,6 +371,191 @@
   free(tmparr_);
 }
 
+// Assume x buffer is already extended on both sides with x pointing to the
+// leftmost pixel, and the extension values are already filled up.
+static void resample_1d_core_in8b(const uint8_t *x, int inlen,
+                                  RationalResampleFilter *rf, int downshift,
+                                  ClipProfile *clip, int16_t *y, int outlen) {
+  (void)inlen;
+  const int tapsby2 = rf->length / 2;
+  const uint8_t *xext = x;
+  xext += rf->start;
+  for (int i = 0, p = 0; i < outlen; ++i, p = (p + 1) % rf->p) {
+    int sum = 0;
+    for (int j = -tapsby2 + 1; j <= tapsby2; ++j) {
+      sum += (int)rf->filter[p][j + tapsby2 - 1] * (int)xext[j];
+    }
+    y[i] = (int16_t)ROUND_POWER_OF_TWO_SIGNED(sum, downshift);
+    if (clip) {
+      y[i] = (int16_t)clip->issigned ? doclip(y[i], -(1 << (clip->bits - 1)),
+                                              (1 << (clip->bits - 1)) - 1)
+                                     : doclip(y[i], 0, (1 << clip->bits) - 1);
+    }
+    xext += rf->steps[p];
+  }
+}
+
+// Assume x buffer is already extended on both sides with x pointing to the
+// leftmost pixel, and the extension values are already filled up.
+static void resample_1d_core_8b(const uint8_t *x, int inlen,
+                                RationalResampleFilter *rf, int downshift,
+                                ClipProfile *clip, uint8_t *y, int outlen) {
+  (void)inlen;
+  const int tapsby2 = rf->length / 2;
+  const uint8_t *xext = x + rf->start;
+  for (int i = 0, p = 0; i < outlen; ++i, p = (p + 1) % rf->p) {
+    int sum = 0;
+    for (int j = -tapsby2 + 1; j <= tapsby2; ++j) {
+      sum += (int)rf->filter[p][j + tapsby2 - 1] * (int)xext[j];
+    }
+    int val = ROUND_POWER_OF_TWO_SIGNED(sum, downshift);
+    if (clip) {
+      val = clip->issigned ? doclip(val, -(1 << (clip->bits - 1)),
+                                    (1 << (clip->bits - 1)) - 1)
+                           : doclip(val, 0, (1 << clip->bits) - 1);
+    }
+    y[i] = (uint8_t)val;  // narrow only after clipping, so overshoot saturates
+    xext += rf->steps[p];
+  }
+}
+
+static void extend_border_8b(uint8_t *x, int inlen, int border) {
+  for (int i = -border; i < 0; ++i) x[i] = x[0];
+  for (int i = 0; i < border; ++i) x[i + inlen] = x[inlen - 1];
+}
+
+static void resample_1d_xt_8b(uint8_t *x, int inlen, RationalResampleFilter *rf,
+                              int downshift, ClipProfile *clip, uint8_t *y,
+                              int outlen) {
+  extend_border_8b(x, inlen, rf->length / 2);
+  resample_1d_core_8b(x, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void resample_1d_xc_8b(const uint8_t *x, int inlen,
+                              RationalResampleFilter *rf, int downshift,
+                              ClipProfile *clip, uint8_t *y, int outlen,
+                              uint8_t *xext) {
+  memcpy(xext, x, inlen * sizeof(*x));
+  // xext needs rf->length / 2 writable samples on each side for the borders.
+  resample_1d_xt_8b(xext, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void resample_1d_xt_in8b(uint8_t *x, int inlen,
+                                RationalResampleFilter *rf, int downshift,
+                                ClipProfile *clip, int16_t *y, int outlen) {
+  extend_border_8b(x, inlen, rf->length / 2);
+  resample_1d_core_in8b(x, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void resample_1d_xc_in8b(const uint8_t *x, int inlen,
+                                RationalResampleFilter *rf, int downshift,
+                                ClipProfile *clip, int16_t *y, int outlen,
+                                uint8_t *xext) {
+  memcpy(xext, x, inlen * sizeof(*x));
+  // xext needs rf->length / 2 writable samples on each side for the borders.
+  resample_1d_xt_in8b(xext, inlen, rf, downshift, clip, y, outlen);
+}
+
+static void fill_col_to_arr_in8b(const uint8_t *img, int stride, int len,
+                                 int16_t *arr) {
+  int i;
+  const uint8_t *iptr = img;
+  int16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *aptr++ = (int16_t)(*iptr);
+  }
+}
+
+static void fill_arr_to_col_out8b(uint8_t *img, int stride, int len,
+                                  const int16_t *arr) {
+  int i;
+  uint8_t *iptr = img;
+  const int16_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *iptr = (uint8_t)*aptr++;
+  }
+}
+
+void resample_1d_8b(const uint8_t *x, int inlen, RationalResampleFilter *rf,
+                    int downshift, ClipProfile *clip, uint8_t *y, int outlen) {
+  const int tapsby2 = rf->length / 2;
+  uint8_t *xext_ = (uint8_t *)malloc((inlen + rf->length) * sizeof(*x));
+  uint8_t *xext = xext_ + tapsby2;
+  // xext leaves tapsby2 samples of headroom in front of the copied input.
+  resample_1d_xc_8b(x, inlen, rf, downshift, clip, y, outlen, xext);
+  // xext_ is the pointer malloc returned, so it is the one passed to free.
+  free(xext_);
+}
+
+void resample_2d_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                    RationalResampleFilter *rfh, RationalResampleFilter *rfv,
+                    int int_extra_bits, ClipProfile *clip, uint8_t *y,
+                    int outwidth, int outheight, int outstride) {
+  if (rfv == NULL || is_resampler_noop(rfv)) {
+    resample_horz_8b(x, inwidth, inheight, instride, rfh, clip, y, outwidth,
+                     outstride);
+    return;
+  }
+  if (rfh == NULL || is_resampler_noop(rfh)) {
+    resample_vert_8b(x, inwidth, inheight, instride, rfv, clip, y, outheight,
+                     outstride);
+    return;
+  }
+  int16_t *tmpbuf = (int16_t *)malloc(sizeof(int16_t) * outwidth * inheight);
+  const int arrsize =
+      outheight + ((inheight + rfv->length > inwidth + rfh->length)
+                       ? (inheight + rfv->length)
+                       : (inwidth + rfh->length));
+  int16_t *tmparr_ = (int16_t *)calloc(arrsize, sizeof(int16_t));
+  int16_t *tmparrh = tmparr_ + outheight + rfh->length / 2;
+  int16_t *tmparrv = tmparr_ + outheight + rfv->length / 2;
+  int16_t *tmparro = tmparr_;
+  int tmpstride = outwidth;
+  const int downshifth = rfh->filter_bits - int_extra_bits;
+  const int downshiftv = rfv->filter_bits + int_extra_bits;  // rfv, not rfh
+  for (int i = 0; i < inheight; ++i) {  // horizontal pass: 8-bit in, 16-bit out
+    resample_1d_xc_in8b(x + instride * i, inwidth, rfh, downshifth, NULL,
+                        tmpbuf + i * tmpstride, outwidth, (uint8_t *)tmparrh);
+  }
+  for (int i = 0; i < outwidth; ++i) {  // vertical pass, one column at a time
+    fill_col_to_arr(tmpbuf + i, outwidth, inheight, tmparrv);
+    resample_1d_xt(tmparrv, inheight, rfv, downshiftv, clip, tmparro,
+                   outheight);
+    fill_arr_to_col_out8b(y + i, outstride, outheight, tmparro);
+  }
+  free(tmpbuf);
+  free(tmparr_);
+}
+
+void resample_horz_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfh, ClipProfile *clip,
+                      uint8_t *y, int outwidth, int outstride) {
+  const int arrsize = inwidth + rfh->length;
+  uint8_t *tmparr_ = (uint8_t *)calloc(arrsize, sizeof(*tmparr_));
+  uint8_t *tmparrh = tmparr_ + rfh->length / 2;
+  for (int i = 0; i < inheight; ++i) {
+    resample_1d_xc_8b(x + instride * i, inwidth, rfh, rfh->filter_bits, clip,
+                      y + i * outstride, outwidth, tmparrh);
+  }
+  free(tmparr_);
+}
+
+void resample_vert_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfv, ClipProfile *clip,
+                      uint8_t *y, int outheight, int outstride) {
+  const int arrsize = outheight + inheight + rfv->length;
+  int16_t *tmparr_ = (int16_t *)calloc(arrsize, sizeof(int16_t));
+  int16_t *tmparrv = tmparr_ + outheight + rfv->length / 2;
+  int16_t *tmparro = tmparr_;
+  for (int i = 0; i < inwidth; ++i) {
+    fill_col_to_arr_in8b(x + i, instride, inheight, tmparrv);
+    resample_1d_xt(tmparrv, inheight, rfv, rfv->filter_bits, clip, tmparro,
+                   outheight);
+    fill_arr_to_col_out8b(y + i, outstride, outheight, tmparro);
+  }
+  free(tmparr_);
+}
+
 int get_resampled_output_length(int inlen, int p, int q, int force_even) {
   if (!force_even) {
     // round
diff --git a/tools/lanczos/lanczos_resample.h b/tools/lanczos/lanczos_resample.h
index 51fff05..039fd87 100644
--- a/tools/lanczos/lanczos_resample.h
+++ b/tools/lanczos/lanczos_resample.h
@@ -49,25 +49,12 @@
 // whether the resampler filter is a no-op
 int is_resampler_noop(RationalResampleFilter *rf);
 
+// 16-bit versions of high-level resampling functions
+
 // Assume no extension of the input x buffer
 void resample_1d(const int16_t *x, int inlen, RationalResampleFilter *rf,
                  int downshift, ClipProfile *clip, int16_t *y, int outlen);
 
-// Assume a scratch buffer xext of size inlen + rf->length is provided
-void resample_1d_xc(const int16_t *x, int inlen, RationalResampleFilter *rf,
-                    int downshift, ClipProfile *clip, int16_t *y, int outlen,
-                    int16_t *xext);
-
-// Assume x buffer is already extended on both sides with x pointing to the
-// leftmost pixel, but the extension values are not filled up.
-void resample_1d_xt(int16_t *x, int inlen, RationalResampleFilter *rf,
-                    int downshift, ClipProfile *clip, int16_t *y, int outlen);
-
-// Assume x buffer is already extended on both sides with x pointing to the
-// leftmost pixel, and the extension values are already filled up.
-void resample_1d_core(const int16_t *x, int inlen, RationalResampleFilter *rf,
-                      int downshift, ClipProfile *clip, int16_t *y, int outlen);
-
 void resample_2d(const int16_t *x, int inwidth, int inheight, int instride,
                  RationalResampleFilter *rfh, RationalResampleFilter *rfv,
                  int int_extra_bits, ClipProfile *clip, int16_t *y,
@@ -76,10 +63,30 @@
 void resample_horz(const int16_t *x, int inwidth, int inheight, int instride,
                    RationalResampleFilter *rfh, ClipProfile *clip, int16_t *y,
                    int outwidth, int outstride);
+
 void resample_vert(const int16_t *x, int inwidth, int inheight, int instride,
                    RationalResampleFilter *rfv, ClipProfile *clip, int16_t *y,
                    int outheight, int outstride);
 
+// 8-bit versions of high-level resampling functions
+
+// Assume no extension of the input x buffer
+void resample_1d_8b(const uint8_t *x, int inlen, RationalResampleFilter *rf,
+                    int downshift, ClipProfile *clip, uint8_t *y, int outlen);
+
+void resample_2d_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                    RationalResampleFilter *rfh, RationalResampleFilter *rfv,
+                    int int_extra_bits, ClipProfile *clip, uint8_t *y,
+                    int outwidth, int outheight, int outstride);
+
+void resample_horz_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfh, ClipProfile *clip,
+                      uint8_t *y, int outwidth, int outstride);
+
+void resample_vert_8b(const uint8_t *x, int inwidth, int inheight, int instride,
+                      RationalResampleFilter *rfv, ClipProfile *clip,
+                      uint8_t *y, int outheight, int outstride);
+
 void show_resample_filter(RationalResampleFilter *rf);
 
 int get_resampled_output_length(int inlen, int p, int q, int force_even);
diff --git a/tools/lanczos/lanczos_resample_y4m.c b/tools/lanczos/lanczos_resample_y4m.c
index d3c859b..88fa447 100644
--- a/tools/lanczos/lanczos_resample_y4m.c
+++ b/tools/lanczos/lanczos_resample_y4m.c
@@ -230,8 +230,6 @@
       (uint8_t *)malloc((ysize + 2 * uvsize) * bytes_per_pel * sizeof(uint8_t));
   uint8_t *outbuf = (uint8_t *)malloc((rysize + 2 * ruvsize) * bytes_per_pel *
                                       sizeof(uint8_t));
-  int16_t *src = (int16_t *)malloc((ysize + 2 * uvsize) * sizeof(int16_t));
-  int16_t *res = (int16_t *)malloc((rysize + 2 * ruvsize) * sizeof(int16_t));
 
   ClipProfile clip = { bitdepth, 0 };
 
@@ -245,45 +243,38 @@
       break;
     }
     if (fread(inbuf, (ysize + 2 * uvsize) * bytes_per_pel, 1, fin) != 1) break;
-    int16_t *s, *r;
     if (bytes_per_pel == 1) {
-      uint8_t *d = inbuf;
-      s = src;
-      for (int i = 0; i < ysize + 2 * uvsize; ++i) *s++ = (int16_t)(*d++);
+      uint8_t *s = inbuf;
+      uint8_t *r = outbuf;
+      resample_2d_8b(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf,
+                     int_extra_bits, &clip, r, rywidth, ryheight, rywidth);
+      s += ysize;
+      r += rysize;
+      resample_2d_8b(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                     int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
+      s += uvsize;
+      r += ruvsize;
+      resample_2d_8b(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                     int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     } else {
-      uint16_t *d = (uint16_t *)inbuf;
-      s = src;
-      for (int i = 0; i < ysize + 2 * uvsize; ++i) *s++ = (int16_t)(*d++);
-    }
-    s = src;
-    r = res;
-    resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf, int_extra_bits,
-                &clip, r, rywidth, ryheight, rywidth);
-    s += ysize;
-    r += rysize;
-    resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
-    s += uvsize;
-    r += ruvsize;
-    resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
-    if (bytes_per_pel == 1) {
-      uint8_t *d = outbuf;
-      r = res;
-      for (int i = 0; i < rysize + 2 * ruvsize; ++i) *d++ = (uint8_t)(*r++);
-    } else {
-      uint16_t *d = (uint16_t *)outbuf;
-      r = res;
-      for (int i = 0; i < rysize + 2 * ruvsize; ++i) *d++ = (uint16_t)(*r++);
+      int16_t *s = (int16_t *)inbuf;
+      int16_t *r = (int16_t *)outbuf;
+      resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf,
+                  int_extra_bits, &clip, r, rywidth, ryheight, rywidth);
+      s += ysize;
+      r += rysize;
+      resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                  int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
+      s += uvsize;
+      r += ruvsize;
+      resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                  int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     }
     fwrite(frametag, 6, 1, fout);
     fwrite(outbuf, (rysize + 2 * ruvsize) * bytes_per_pel, 1, fout);
   }
   fclose(fin);
   fclose(fout);
-
   free(inbuf);
   free(outbuf);
-  free(src);
-  free(res);
 }
diff --git a/tools/lanczos/lanczos_resample_yuv.c b/tools/lanczos/lanczos_resample_yuv.c
index 8cc4a1e..2f9e7f8 100644
--- a/tools/lanczos/lanczos_resample_yuv.c
+++ b/tools/lanczos/lanczos_resample_yuv.c
@@ -207,8 +207,6 @@
       (uint8_t *)malloc((ysize + 2 * uvsize) * bytes_per_pel * sizeof(uint8_t));
   uint8_t *outbuf = (uint8_t *)malloc((rysize + 2 * ruvsize) * bytes_per_pel *
                                       sizeof(uint8_t));
-  int16_t *src = (int16_t *)malloc((ysize + 2 * uvsize) * sizeof(int16_t));
-  int16_t *res = (int16_t *)malloc((rysize + 2 * ruvsize) * sizeof(int16_t));
 
   FILE *fin = fopen(yuv_input, "rb");
   FILE *fout = fopen(yuv_output, "wb");
@@ -217,44 +215,37 @@
 
   for (int n = 0; n < num_frames; ++n) {
     if (fread(inbuf, (ysize + 2 * uvsize) * bytes_per_pel, 1, fin) != 1) break;
-    int16_t *s, *r;
     if (bytes_per_pel == 1) {
-      uint8_t *d = inbuf;
-      s = src;
-      for (int i = 0; i < ysize + 2 * uvsize; ++i) *s++ = (int16_t)(*d++);
+      uint8_t *s = inbuf;
+      uint8_t *r = outbuf;
+      resample_2d_8b(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf,
+                     int_extra_bits, &clip, r, rywidth, ryheight, rywidth);
+      s += ysize;
+      r += rysize;
+      resample_2d_8b(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                     int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
+      s += uvsize;
+      r += ruvsize;
+      resample_2d_8b(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                     int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     } else {
-      uint16_t *d = (uint16_t *)inbuf;
-      s = src;
-      for (int i = 0; i < ysize + 2 * uvsize; ++i) *s++ = (int16_t)(*d++);
-    }
-    s = src;
-    r = res;
-    resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf, int_extra_bits,
-                &clip, r, rywidth, ryheight, rywidth);
-    s += ysize;
-    r += rysize;
-    resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
-    s += uvsize;
-    r += ruvsize;
-    resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
-    if (bytes_per_pel == 1) {
-      uint8_t *d = outbuf;
-      r = res;
-      for (int i = 0; i < rysize + 2 * ruvsize; ++i) *d++ = (uint8_t)(*r++);
-    } else {
-      uint16_t *d = (uint16_t *)outbuf;
-      r = res;
-      for (int i = 0; i < rysize + 2 * ruvsize; ++i) *d++ = (uint16_t)(*r++);
+      int16_t *s = (int16_t *)inbuf;
+      int16_t *r = (int16_t *)outbuf;
+      resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf,
+                  int_extra_bits, &clip, r, rywidth, ryheight, rywidth);
+      s += ysize;
+      r += rysize;
+      resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                  int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
+      s += uvsize;
+      r += ruvsize;
+      resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
+                  int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     }
     fwrite(outbuf, (rysize + 2 * ruvsize) * bytes_per_pel, 1, fout);
   }
   fclose(fin);
   fclose(fout);
-
   free(inbuf);
   free(outbuf);
-  free(src);
-  free(res);
 }