Improve support for horizontal-only and vertical-only resampling

Change-Id: Ib7e68794facf042a52edc58915b830da700e43fe
diff --git a/tools/lanczos/lanczos_README.txt b/tools/lanczos/lanczos_README.txt
index 8a44b1f..d55b3ba 100644
--- a/tools/lanczos/lanczos_README.txt
+++ b/tools/lanczos/lanczos_README.txt
@@ -45,6 +45,8 @@
                    or 'c' meaning centered
       <outwidth>x<outheight> is output video dimensions
                              only needed in case of upsampling
+      A resampling config string of 1:1:1:0 for the horizontal or vertical
+          direction is treated as a no-op in that direction.
 
 Example usages:
 
@@ -111,6 +113,8 @@
                    or 'c' meaning centered
       <outwidth>x<outheight> is output video dimensions
                              only needed in case of upsampling
+      A resampling config string of 1:1:1:0 for the horizontal or vertical
+          direction is treated as a no-op in that direction.
 
 
 
diff --git a/tools/lanczos/lanczos_resample.c b/tools/lanczos/lanczos_resample.c
index 3165e34..d550a14 100644
--- a/tools/lanczos/lanczos_resample.c
+++ b/tools/lanczos/lanczos_resample.c
@@ -187,6 +187,7 @@
   rf->p = p / g;
   rf->q = q / g;
   if (x0 == (double)('c')) x0 = get_centered_x0(rf->p, rf->q);
+  rf->filter_bits = bits;
   for (int i = 0; i < rf->p; ++i) {
     offset[i] = (double)rf->q / (double)rf->p * i + x0;
     intpel[i] = (int)floor(offset[i]);
@@ -272,19 +273,21 @@
   free(xext_);
 }
 
-static void fill_col_to_arr(int16_t *img, int stride, int len, int16_t *arr) {
+static void fill_col_to_arr(const int16_t *img, int stride, int len,
+                            int16_t *arr) {  // gather strided column -> arr
   int i;
-  int16_t *iptr = img;
+  const int16_t *iptr = img;  // read cursor; steps by stride per element
   int16_t *aptr = arr;
   for (i = 0; i < len; ++i, iptr += stride) {
     *aptr++ = *iptr;
   }
 }
 
-static void fill_arr_to_col(int16_t *img, int stride, int len, int16_t *arr) {
+static void fill_arr_to_col(int16_t *img, int stride, int len,
+                            const int16_t *arr) {  // scatter arr -> column
   int i;
   int16_t *iptr = img;
-  int16_t *aptr = arr;
+  const int16_t *aptr = arr;  // read cursor over the contiguous source
   for (i = 0; i < len; ++i, iptr += stride) {
     *iptr = *aptr++;
   }
@@ -292,18 +295,39 @@
 
+// Resamples the plane x (inwidth x inheight, row pitch instride) into y.
+// If rfv is NULL or a no-op, only horizontal filtering is done; otherwise, if
+// rfh is NULL or a no-op, only vertical filtering is done. rfv is tested
+// first, so rfh must be non-NULL whenever rfv is NULL or a no-op.
+// Otherwise every row is first filtered with rfh, without a clip profile,
+// into an intermediate buffer. The horizontal shift is rfh's filter
+// precision minus int_extra_bits; the vertical shift is rfv's filter
+// precision plus int_extra_bits.
 void resample_2d(const int16_t *x, int inwidth, int inheight, int instride,
                  RationalResampleFilter *rfh, RationalResampleFilter *rfv,
-                 int downshifth, int downshiftv, ClipProfile *clip, int16_t *y,
+                 int int_extra_bits, ClipProfile *clip, int16_t *y,
                  int outwidth, int outheight, int outstride) {
+  if (rfv == NULL || is_resampler_noop(rfv)) {
+    resample_horz(x, inwidth, inheight, instride, rfh, clip, y, outwidth,
+                  outstride);
+    return;
+  }
+  if (rfh == NULL || is_resampler_noop(rfh)) {
+    resample_vert(x, inwidth, inheight, instride, rfv, clip, y, outheight,
+                  outstride);
+    return;
+  }
   int16_t *tmpbuf = (int16_t *)malloc(sizeof(int16_t) * outwidth * inheight);
   const int arrsize =
       outheight + ((inheight + rfv->length > inwidth + rfh->length)
                        ? (inheight + rfv->length)
                        : (inwidth + rfh->length));
-  int16_t *tmparr_ = (int16_t *)malloc(sizeof(int16_t) * arrsize);
+  int16_t *tmparr_ = (int16_t *)calloc(arrsize, sizeof(int16_t));
   int16_t *tmparrh = tmparr_ + outheight + rfh->length / 2;
   int16_t *tmparrv = tmparr_ + outheight + rfv->length / 2;
   int16_t *tmparro = tmparr_;
   int tmpstride = outwidth;
+  // Derive each shift from the filter it pairs with (rfh vs. rfv).
+  const int downshifth = rfh->filter_bits - int_extra_bits;
+  const int downshiftv = rfv->filter_bits + int_extra_bits;
   for (int i = 0; i < inheight; ++i) {
     resample_1d_xc(x + instride * i, inwidth, rfh, downshifth, NULL,
                    tmpbuf + i * tmpstride, outwidth, tmparrh);
@@ -318,19 +333,35 @@
   free(tmparr_);
 }
 
-void resample_hor(const int16_t *x, int inwidth, int inheight, int instride,
-                  RationalResampleFilter *rfh, int downshifth,
-                  ClipProfile *clip, int16_t *y, int outwidth, int outstride) {
-  int16_t *tmparr_ =
-      (int16_t *)malloc(sizeof(int16_t) * (inwidth + rfh->length));
+void resample_horz(const int16_t *x, int inwidth, int inheight, int instride,
+                   RationalResampleFilter *rfh, ClipProfile *clip, int16_t *y,
+                   int outwidth, int outstride) {  // filters each row with rfh
+  const int arrsize = inwidth + rfh->length;  // one row + rfh->length extra
+  int16_t *tmparr_ = (int16_t *)calloc(arrsize, sizeof(int16_t));  // zeroed
   int16_t *tmparrh = tmparr_ + rfh->length / 2;
   for (int i = 0; i < inheight; ++i) {
-    resample_1d_xc(x + instride * i, inwidth, rfh, downshifth, clip,
+    resample_1d_xc(x + instride * i, inwidth, rfh, rfh->filter_bits, clip,
                    y + i * outstride, outwidth, tmparrh);
   }
   free(tmparr_);
 }
 
+void resample_vert(const int16_t *x, int inwidth, int inheight, int instride,
+                   RationalResampleFilter *rfv, ClipProfile *clip, int16_t *y,
+                   int outheight, int outstride) {  // filters each column
+  const int arrsize = outheight + inheight + rfv->length;  // out + in + pad
+  int16_t *tmparr_ = (int16_t *)calloc(arrsize, sizeof(int16_t));  // zeroed
+  int16_t *tmparrv = tmparr_ + outheight + rfv->length / 2;  // input column
+  int16_t *tmparro = tmparr_;  // output column, first outheight entries
+  for (int i = 0; i < inwidth; ++i) {  // one column per iteration
+    fill_col_to_arr(x + i, instride, inheight, tmparrv);  // gather
+    resample_1d_xt(tmparrv, inheight, rfv, rfv->filter_bits, clip, tmparro,
+                   outheight);
+    fill_arr_to_col(y + i, outstride, outheight, tmparro);  // scatter
+  }
+  free(tmparr_);
+}
+
 int get_resampled_output_length(int inlen, int p, int q, int force_even) {
   if (!force_even) {
     // round
diff --git a/tools/lanczos/lanczos_resample.h b/tools/lanczos/lanczos_resample.h
index 3c54023..51fff05 100644
--- a/tools/lanczos/lanczos_resample.h
+++ b/tools/lanczos/lanczos_resample.h
@@ -25,6 +25,7 @@
   int length;
   int start;
   int steps[MAX_RATIONAL_FACTOR];
+  int filter_bits;
   int16_t filter[MAX_RATIONAL_FACTOR][MAX_FILTER_LEN];
   double phases[MAX_RATIONAL_FACTOR];
 } RationalResampleFilter;
@@ -69,12 +70,15 @@
 
 void resample_2d(const int16_t *x, int inwidth, int inheight, int instride,
                  RationalResampleFilter *rfh, RationalResampleFilter *rfv,
-                 int downshifth, int downshiftv, ClipProfile *clip, int16_t *y,
+                 int int_extra_bits, ClipProfile *clip, int16_t *y,
                  int outwidth, int outheight, int outstride);
 
-void resample_hor(const int16_t *x, int inwidth, int inheight, int instride,
-                  RationalResampleFilter *rfh, int downshifth,
-                  ClipProfile *clip, int16_t *y, int outwidth, int outstride);
+void resample_horz(const int16_t *x, int inwidth, int inheight, int instride,
+                   RationalResampleFilter *rfh, ClipProfile *clip, int16_t *y,
+                   int outwidth, int outstride);
+void resample_vert(const int16_t *x, int inwidth, int inheight, int instride,
+                   RationalResampleFilter *rfv, ClipProfile *clip, int16_t *y,
+                   int outheight, int outstride);
 
 void show_resample_filter(RationalResampleFilter *rf);
 
diff --git a/tools/lanczos/lanczos_resample_y4m.c b/tools/lanczos/lanczos_resample_y4m.c
index 52129c0..d3c859b 100644
--- a/tools/lanczos/lanczos_resample_y4m.c
+++ b/tools/lanczos/lanczos_resample_y4m.c
@@ -219,8 +219,7 @@
   const int ruvsize = ruvwidth * ruvheight;
 
   const int bits = COEFF_PREC_BITS;
-  const int horz_downshift = bits - INT_EXTRA_PREC_BITS;
-  const int vert_downshift = bits + INT_EXTRA_PREC_BITS;
+  const int int_extra_bits = INT_EXTRA_PREC_BITS;
 
   get_resample_filter(horz_p, horz_q, horz_a, horz_x0, bits, &horz_rf);
   // show_resample_filter(&horz_rf);
@@ -258,18 +257,16 @@
     }
     s = src;
     r = res;
-    resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf, horz_downshift,
-                vert_downshift, &clip, r, rywidth, ryheight, rywidth);
+    resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf, int_extra_bits,
+                &clip, r, rywidth, ryheight, rywidth);
     s += ysize;
     r += rysize;
     resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                horz_downshift, vert_downshift, &clip, r, ruvwidth, ruvheight,
-                ruvwidth);
+                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     s += uvsize;
     r += ruvsize;
     resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                horz_downshift, vert_downshift, &clip, r, ruvwidth, ruvheight,
-                ruvwidth);
+                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     if (bytes_per_pel == 1) {
       uint8_t *d = outbuf;
       r = res;
diff --git a/tools/lanczos/lanczos_resample_yuv.c b/tools/lanczos/lanczos_resample_yuv.c
index 2df0110..8cc4a1e 100644
--- a/tools/lanczos/lanczos_resample_yuv.c
+++ b/tools/lanczos/lanczos_resample_yuv.c
@@ -196,8 +196,7 @@
   const int ruvsize = ruvwidth * ruvheight;
 
   const int bits = COEFF_PREC_BITS;
-  const int horz_downshift = bits - INT_EXTRA_PREC_BITS;
-  const int vert_downshift = bits + INT_EXTRA_PREC_BITS;
+  const int int_extra_bits = INT_EXTRA_PREC_BITS;
 
   get_resample_filter(horz_p, horz_q, horz_a, horz_x0, bits, &horz_rf);
   // show_resample_filter(&horz_rf);
@@ -230,18 +229,16 @@
     }
     s = src;
     r = res;
-    resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf, horz_downshift,
-                vert_downshift, &clip, r, rywidth, ryheight, rywidth);
+    resample_2d(s, ywidth, yheight, ywidth, &horz_rf, &vert_rf, int_extra_bits,
+                &clip, r, rywidth, ryheight, rywidth);
     s += ysize;
     r += rysize;
     resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                horz_downshift, vert_downshift, &clip, r, ruvwidth, ruvheight,
-                ruvwidth);
+                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     s += uvsize;
     r += ruvsize;
     resample_2d(s, uvwidth, uvheight, uvwidth, &horz_rf, &vert_rf,
-                horz_downshift, vert_downshift, &clip, r, ruvwidth, ruvheight,
-                ruvwidth);
+                int_extra_bits, &clip, r, ruvwidth, ruvheight, ruvwidth);
     if (bytes_per_pel == 1) {
       uint8_t *d = outbuf;
       r = res;