Add a dedicated C function for 2:1 downscaling
Used by SVC.
NEON/SSSE3 optimizations to follow.
Change-Id: I0afef5553b840ead10124910650ce1fa7382c19d
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 3cd5cfc..f9dd35b 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -111,6 +111,40 @@
w, h);
}
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ // 2-D scaled convolution: filters a block from src and writes w x h to dst.
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // When called from the frame scaling function the smallest factor is x1/4
+ // ==> y_step_q4 = 64; w and h are then at most 16, so temp is still big enough.
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
+}
+
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, int w, int h) {
for (int r = h; r > 0; --r) {
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 90af7e7..400416f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -353,6 +353,7 @@
#
# Sub Pixel Filters
#
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 901203e..ecd6d63 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -264,6 +264,9 @@
add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
specialize "av1_round_shift_array", qw/sse4_1 neon/;
+# Resize functions.
+add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
+
#
# Encoder functions below this point.
#
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 98f28f7..e383947 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -24,6 +24,7 @@
#include "av1/common/common.h"
#include "av1/common/resize.h"
+#include "config/aom_dsp_rtcd.h"
#include "config/aom_scale_rtcd.h"
// Filters for interpolation (0.5-band) - note this also filters integer pels.
@@ -1188,9 +1189,47 @@
}
#endif // CONFIG_AV1_HIGHBITDEPTH
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int bd,
- const int num_planes) {
+void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ const InterpFilter filter,
+ const int phase_scaler,
+ const int num_planes) {  // Rescales src into dst, plane by plane.
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH);
+ const InterpKernel *const kernel =
+ filter == BILINEAR ? av1_bilinear_filters : av1_sub_pel_filters_8smooth;
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {  // per plane
+ const int factor = (i == 0 || i == 3 ? 1 : 2);  // 2 for the u/v planes
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ for (int y = 0; y < dst_h; y += 16) {  // 16 y-plane rows per tile
+ const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+ for (int x = 0; x < dst_w; x += 16) {  // 16 y-plane columns per tile
+ const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+ const uint8_t *src_ptr = srcs[i] +
+ (y / factor) * src_h / dst_h * src_stride +
+ (x / factor) * src_w / dst_w;
+ uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+ aom_convolve8_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor);
+ }  // NOTE(review): edge tiles are not clipped to dst_w/dst_h - confirm.
+ }
+ }  // NOTE(review): i == 3 looks unreachable with 3-entry arrays - confirm.
+}  // NOTE(review): no border extension is visible here despite the name.
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ const int num_planes) {
// TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
// We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
@@ -1300,12 +1339,19 @@
YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
YV12_BUFFER_CONFIG *unscaled,
- YV12_BUFFER_CONFIG *scaled) {
+ YV12_BUFFER_CONFIG *scaled,
+ const InterpFilter filter,
+ const int phase) {
const int num_planes = av1_num_planes(cm);
if (cm->width != unscaled->y_crop_width ||
cm->height != unscaled->y_crop_height) {
- av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
- num_planes);
+ if ((cm->width << 1) == unscaled->y_crop_width &&
+ (cm->height << 1) == unscaled->y_crop_height) {
+ av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+ } else {
+ av1_resize_and_extend_frame_nonnormative(
+ unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+ }
return scaled;
} else {
return unscaled;
diff --git a/av1/common/resize.h b/av1/common/resize.h
index 8ee859e..f5c84b0 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -63,9 +63,6 @@
uint8_t *oy, int oy_stride, uint8_t *ou,
uint8_t *ov, int ouv_stride, int oheight,
int owidth, int bd);
-void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int bd,
- const int num_planes);
void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
int src_stride, uint8_t *dst, int dst_stride,
@@ -76,7 +73,13 @@
YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
YV12_BUFFER_CONFIG *unscaled,
- YV12_BUFFER_CONFIG *scaled);
+ YV12_BUFFER_CONFIG *scaled,
+ const InterpFilter filter,
+ const int phase);
+
+void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ const int num_planes);
// Calculates the scaled dimensions from the given original dimensions and the
// resize scale denominator.
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 013bbfb..b9fbd23 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -947,7 +947,8 @@
// Set frame_input source to true source for psnr calculation.
if (apply_filtering && is_psnr_calc_enabled(cpi)) {
cpi->source =
- av1_scale_if_required(cm, source_kf_buffer, &cpi->scaled_source);
+ av1_scale_if_required(cm, source_kf_buffer, &cpi->scaled_source,
+ cm->features.interp_filter, 0);
cpi->unscaled_source = source_kf_buffer;
}
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 872a4e5..cefe418 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1960,7 +1960,7 @@
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- av1_resize_and_extend_frame(
+ av1_resize_and_extend_frame_nonnormative(
ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
@@ -2755,10 +2755,12 @@
aom_clear_system_state();
cpi->source =
- av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
+ cm->features.interp_filter, 0);
if (cpi->unscaled_last_source != NULL) {
cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source);
+ &cpi->scaled_last_source,
+ cm->features.interp_filter, 0);
}
setup_frame(cpi);
@@ -2832,8 +2834,19 @@
static int encode_without_recode(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+ SVC *const svc = &cpi->svc;
int top_index = 0, bottom_index = 0, q = 0;
-
+ YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+ InterpFilter downsample_filter =
+ cpi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+ : EIGHTTAP_REGULAR;
+ int phase_scaler =
+ cpi->use_svc ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0;
+ if ((cm->width << 1) == unscaled->y_crop_width &&
+ (cm->height << 1) == unscaled->y_crop_height) {
+ downsample_filter = BILINEAR;
+ phase_scaler = 8;
+ }
set_size_independent_vars(cpi);
av1_setup_frame_size(cpi);
set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
@@ -2849,11 +2862,12 @@
aom_clear_system_state();
- cpi->source =
- av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source,
+ downsample_filter, phase_scaler);
if (cpi->unscaled_last_source != NULL) {
cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source);
+ &cpi->scaled_last_source,
+ downsample_filter, phase_scaler);
}
if (!frame_is_intra_only(cm)) scale_references(cpi);
@@ -3014,10 +3028,12 @@
}
}
cpi->source =
- av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
+ cm->features.interp_filter, 0);
if (cpi->unscaled_last_source != NULL) {
cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source);
+ &cpi->scaled_last_source,
+ cm->features.interp_filter, 0);
}
if (!frame_is_intra_only(cm)) {
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index a2e1b03f..985313b 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -392,8 +392,9 @@
"Failed to reallocate scaled source buffer");
assert(cpi->scaled_source.y_crop_width == scaled_width);
assert(cpi->scaled_source.y_crop_height == scaled_height);
- av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
- (int)cm->seq_params.bit_depth, num_planes);
+ av1_resize_and_extend_frame_nonnormative(
+ cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params.bit_depth,
+ num_planes);
return &cpi->scaled_source;
}
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index b21997f..afc7933 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -77,6 +77,8 @@
memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
}
}
+ svc->downsample_filter_type[sl] = BILINEAR;
+ svc->downsample_filter_phase[sl] = 8;
}
}
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index a8ad843..538a949 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -65,6 +65,11 @@
int temporal_layer_fb[REF_FRAMES];
// Layer context used for rate control in CBR mode.
LAYER_CONTEXT layer_context[AOM_MAX_LAYERS];
+ // Downsampling filter per spatial layer: EIGHTTAP_SMOOTH or BILINEAR.
+ InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+ // downsample_filter_phase: 0 does plain sub-sampling (no weighted average);
+ // 8 centers the target pixel, giving a symmetric averaging filter.
+ int downsample_filter_phase[AOM_MAX_SS_LAYERS];
} SVC;
struct AV1_COMP;
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 957123f..6044e5a 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -523,8 +523,8 @@
&resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1,
cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
cm->features.byte_alignment);
- av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
- av1_num_planes(cm));
+ av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
+ bit_depth, av1_num_planes(cm));
const int resized_y_width = resized_source.y_width;
const int resized_y_height = resized_source.y_height;