Add a dedicated C function for 2:1 frame downscaling

Used by SVC.
NEON/SSSE3 optimizations will follow.

Change-Id: I0afef5553b840ead10124910650ce1fa7382c19d
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 3cd5cfc..f9dd35b 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -111,6 +111,40 @@
                 w, h);
 }
 
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  // Scaled 2-D convolution of a w x h output block, done in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // The *_q4 arguments are in 1/16th pixel units.
+  // Note: the fixed-size temp buffer (stride 64, 135 rows) places limits on
+  // the parameters; the asserts below check them.
+  // Deriving the maximum number of rows needed in temp:
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --(((64 - 1) * 32 + 15) >> 4) + 8 = 134, so 135 rows leave one to spare.
+  // With the frame scaler, steps reach 64 (x1/4) but w, h <= 16: still fits.
+  uint8_t temp[64 * 135];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
+}
+
 void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, int w, int h) {
   for (int r = h; r > 0; --r) {
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 90af7e7..400416f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -353,6 +353,7 @@
 #
 # Sub Pixel Filters
 #
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 901203e..ecd6d63 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -264,6 +264,9 @@
 add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
 specialize "av1_round_shift_array", qw/sse4_1 neon/;
 
+# Resize functions.
+add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
+
 #
 # Encoder functions below this point.
 #
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 98f28f7..e383947 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -24,6 +24,7 @@
 #include "av1/common/common.h"
 #include "av1/common/resize.h"
 
+#include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
@@ -1188,9 +1189,47 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd,
-                                 const int num_planes) {
+void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst,
+                                   const InterpFilter filter,
+                                   const int phase_scaler,
+                                   const int num_planes) {
+  const int src_w = src->y_crop_width;  // luma size; chroma uses factor
+  const int src_h = src->y_crop_height;
+  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+                                   src->v_buffer };
+  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+  assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH);
+  const InterpKernel *const kernel =
+      filter == BILINEAR ? av1_bilinear_filters : av1_sub_pel_filters_8smooth;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {  // Y, U, V
+    const int factor = (i == 0 || i == 3 ? 1 : 2);  // chroma: half size
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    for (int y = 0; y < dst_h; y += 16) {  // 16x16 tiles in luma coords
+      const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+      for (int x = 0; x < dst_w; x += 16) {
+        const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+        const uint8_t *src_ptr = srcs[i] +
+                                 (y / factor) * src_h / dst_h * src_stride +
+                                 (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+        aom_convolve8_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                        x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                        16 * src_h / dst_h, 16 / factor, 16 / factor);
+      }  // NOTE(review): edge tiles are full-size even if dst_w % 16 != 0.
+    }
+  }  // NOTE(review): borders are not extended here, despite the name.
+}
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes) {
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
 
   // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
@@ -1300,12 +1339,19 @@
 
 YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled) {
+                                          YV12_BUFFER_CONFIG *scaled,
+                                          const InterpFilter filter,
+                                          const int phase) {
   const int num_planes = av1_num_planes(cm);
   if (cm->width != unscaled->y_crop_width ||
       cm->height != unscaled->y_crop_height) {
-    av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
-                                num_planes);
+    if ((cm->width << 1) == unscaled->y_crop_width &&
+        (cm->height << 1) == unscaled->y_crop_height) {
+      av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+    } else {
+      av1_resize_and_extend_frame_nonnormative(
+          unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+    }
     return scaled;
   } else {
     return unscaled;
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 8ee859e..f5c84b0 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -63,9 +63,6 @@
                                 uint8_t *oy, int oy_stride, uint8_t *ou,
                                 uint8_t *ov, int ouv_stride, int oheight,
                                 int owidth, int bd);
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                 YV12_BUFFER_CONFIG *dst, int bd,
-                                 const int num_planes);
 
 void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
@@ -76,7 +73,13 @@
 
 YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled);
+                                          YV12_BUFFER_CONFIG *scaled,
+                                          const InterpFilter filter,
+                                          const int phase);
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes);
 
 // Calculates the scaled dimensions from the given original dimensions and the
 // resize scale denominator.
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 013bbfb..b9fbd23 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -947,7 +947,8 @@
   // Set frame_input source to true source for psnr calculation.
   if (apply_filtering && is_psnr_calc_enabled(cpi)) {
     cpi->source =
-        av1_scale_if_required(cm, source_kf_buffer, &cpi->scaled_source);
+        av1_scale_if_required(cm, source_kf_buffer, &cpi->scaled_source,
+                              cm->features.interp_filter, 0);
     cpi->unscaled_source = source_kf_buffer;
   }
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 872a4e5..cefe418 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1960,7 +1960,7 @@
             aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
-          av1_resize_and_extend_frame(
+          av1_resize_and_extend_frame_nonnormative(
               ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
           cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
@@ -2755,10 +2755,12 @@
   aom_clear_system_state();
 
   cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
+                            cm->features.interp_filter, 0);
   if (cpi->unscaled_last_source != NULL) {
     cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+                                             &cpi->scaled_last_source,
+                                             cm->features.interp_filter, 0);
   }
 
   setup_frame(cpi);
@@ -2832,8 +2834,19 @@
 static int encode_without_recode(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+  SVC *const svc = &cpi->svc;
   int top_index = 0, bottom_index = 0, q = 0;
-
+  YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+  InterpFilter downsample_filter =
+      cpi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+                   : EIGHTTAP_REGULAR;
+  int phase_scaler =
+      cpi->use_svc ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0;
+  if ((cm->width << 1) == unscaled->y_crop_width &&
+      (cm->height << 1) == unscaled->y_crop_height) {
+    downsample_filter = BILINEAR;
+    phase_scaler = 8;
+  }
   set_size_independent_vars(cpi);
   av1_setup_frame_size(cpi);
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
@@ -2849,11 +2862,12 @@
 
   aom_clear_system_state();
 
-  cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+  cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source,
+                                      downsample_filter, phase_scaler);
   if (cpi->unscaled_last_source != NULL) {
     cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+                                             &cpi->scaled_last_source,
+                                             downsample_filter, phase_scaler);
   }
   if (!frame_is_intra_only(cm)) scale_references(cpi);
 
@@ -3014,10 +3028,12 @@
       }
     }
     cpi->source =
-        av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+        av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
+                              cm->features.interp_filter, 0);
     if (cpi->unscaled_last_source != NULL) {
       cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
-                                               &cpi->scaled_last_source);
+                                               &cpi->scaled_last_source,
+                                               cm->features.interp_filter, 0);
     }
 
     if (!frame_is_intra_only(cm)) {
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index a2e1b03f..985313b 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -392,8 +392,9 @@
                        "Failed to reallocate scaled source buffer");
   assert(cpi->scaled_source.y_crop_width == scaled_width);
   assert(cpi->scaled_source.y_crop_height == scaled_height);
-  av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
-                              (int)cm->seq_params.bit_depth, num_planes);
+  av1_resize_and_extend_frame_nonnormative(
+      cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params.bit_depth,
+      num_planes);
   return &cpi->scaled_source;
 }
 
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index b21997f..afc7933 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -77,6 +77,8 @@
         memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
       }
     }
+    svc->downsample_filter_type[sl] = BILINEAR;
+    svc->downsample_filter_phase[sl] = 8;
   }
 }
 
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index a8ad843..538a949 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -65,6 +65,11 @@
   int temporal_layer_fb[REF_FRAMES];
   // Layer context used for rate control in CBR mode.
   LAYER_CONTEXT layer_context[AOM_MAX_LAYERS];
+  // EIGHTTAP_SMOOTH or BILINEAR
+  InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+  // downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+  // = 8 will center the target pixel and get a symmetric averaging filter.
+  int downsample_filter_phase[AOM_MAX_SS_LAYERS];
 } SVC;
 
 struct AV1_COMP;
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 957123f..6044e5a 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -523,8 +523,8 @@
       &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1,
       cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
       cm->features.byte_alignment);
-  av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
-                              av1_num_planes(cm));
+  av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
+                                           bit_depth, av1_num_planes(cm));
 
   const int resized_y_width = resized_source.y_width;
   const int resized_y_height = resized_source.y_height;