rtc: Add optimized scaler for 2:1 horiz only for Bilinear
For x86 only.
This also fixes a bug with the aom_scaled_2d(),
for horiz or vert only scaling. The x/y_q4 was not set to 0
if the height/width is the same (no scale change).
So for 2:1 horiz scaling, it was still filtering along
the vertical direction.
Eightap case to be added, and ARM to be added.
Change-Id: I994195e578471c1e898e7fed55c71ea3bb792f8d
diff --git a/av1/common/resize.c b/av1/common/resize.c
index f89f7ca..55dfc6f 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -1247,9 +1247,11 @@
uint8_t *dst_buffer = dst->buffers[i];
const int dst_stride = dst->strides[is_uv];
for (int y = 0; y < dst_h; y += 16) {
- const int y_q4 = y * 16 * src_h / dst_h + phase_scaler;
+ const int y_q4 =
+ src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler;
for (int x = 0; x < dst_w; x += 16) {
- const int x_q4 = x * 16 * src_w / dst_w + phase_scaler;
+ const int x_q4 =
+ src_w == dst_w ? 0 : x * 16 * src_w / dst_w + phase_scaler;
const uint8_t *src_ptr =
src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w;
uint8_t *dst_ptr = dst_buffer + y * dst_stride + x;
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
index a7fdb5a..622e4ec 100644
--- a/av1/common/x86/resize_ssse3.c
+++ b/av1/common/x86/resize_ssse3.c
@@ -89,6 +89,29 @@
return temp;
}
+static void scale_plane_2_to_1_phase_0_horiz(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
+ const __m128i mask = _mm_set1_epi16(0x00FF);
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+ _mm_storeu_si128((__m128i *)dst, d);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += src_stride - 2 * max_width;
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
static void scale_plane_2_to_1_phase_0(const uint8_t *src,
const ptrdiff_t src_stride, uint8_t *dst,
const ptrdiff_t dst_stride,
@@ -148,6 +171,32 @@
return _mm_packus_epi16(t4, t5);
}
+static void scale_plane_2_to_1_bilinear_horiz(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+ do {
+ int x = max_width;
+ do {
+ __m128i s[2], d;
+ // Horizontal
+ s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+ d = scale_plane_bilinear_kernel(s, c0c1);
+ _mm_storeu_si128((__m128i *)dst, d);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += src_stride - (max_width << 1);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
static void scale_plane_2_to_1_bilinear(const uint8_t *src,
const ptrdiff_t src_stride,
uint8_t *dst,
@@ -809,13 +858,15 @@
} while (--y);
}
-// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling
+// There's SIMD optimizations for: (1/4 x 1/4), (1/2 x 1/2), (3/4 x 3/4),
+// horizontal-only downscale (1/2 x 1), and for (2 x 2) upscaling,
// in SSSE3.
static INLINE bool has_normative_scaler_ssse3(const int src_width,
const int src_height,
const int dst_width,
const int dst_height) {
const bool has_normative_scaler =
+ (2 * dst_width == src_width && dst_height == src_height) ||
(2 * dst_width == src_width && 2 * dst_height == src_height) ||
(4 * dst_width == src_width && 4 * dst_height == src_height) ||
(4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) ||
@@ -839,7 +890,7 @@
dst->uv_crop_width, dst->uv_crop_height);
}
- if (!has_normative_scaler) {
+ if (!has_normative_scaler || filter != BILINEAR) {
av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
return;
}
@@ -857,19 +908,30 @@
const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
- if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+ if ((2 * dst_w == src_w && 2 * dst_h == src_h) ||
+ (2 * dst_w == src_w && dst_h == src_h)) {
// 2 to 1
if (phase == 0) {
- scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv], dst_w,
- dst_h);
+ if (2 * dst_w == src_w && dst_h == src_h)
+ scale_plane_2_to_1_phase_0_horiz(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv],
+ dst_w, dst_h);
+ else
+ scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv],
+ dst_w, dst_h);
} else if (filter == BILINEAR) {
const int16_t c0 = av1_bilinear_filters[phase][3];
const int16_t c1 = av1_bilinear_filters[phase][4];
const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
- scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
- dst->buffers[i], dst->strides[is_uv], dst_w,
- dst_h, c0c1);
+ if (2 * dst_w == src_w && dst_h == src_h)
+ scale_plane_2_to_1_bilinear_horiz(
+ src->buffers[i], src->strides[is_uv], dst->buffers[i],
+ dst->strides[is_uv], dst_w, dst_h, c0c1);
+ else
+ scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv],
+ dst_w, dst_h, c0c1);
} else {
const int buffer_stride = (dst_y_w + 3) & ~3;
const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;