Add SSSE3 optimization for 2:1 downscaling
Change-Id: I0eb0717abdf952a002465869e27100bc497289f6
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f280ccf..cc7bfb3 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -271,7 +271,8 @@
"${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
"${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
- "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+ "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/resize_ssse3.c")
if(NOT CONFIG_AV1_HIGHBITDEPTH)
list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ecd6d63..db38ce0 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -266,6 +266,7 @@
# Resize functions.
add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
+specialize qw/av1_resize_and_extend_frame ssse3/;
#
# Encoder functions below this point.
diff --git a/av1/common/resize.c b/av1/common/resize.c
index e383947..faa3978 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -1345,6 +1345,16 @@
const int num_planes = av1_num_planes(cm);
if (cm->width != unscaled->y_crop_width ||
cm->height != unscaled->y_crop_height) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if ((cm->width << 1) == unscaled->y_crop_width &&
+ (cm->height << 1) == unscaled->y_crop_height &&
+ cm->seq_params.bit_depth == AOM_BITS_8) {
+ av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+ } else {
+ av1_resize_and_extend_frame_nonnormative(
+ unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+ }
+#else
if ((cm->width << 1) == unscaled->y_crop_width &&
(cm->height << 1) == unscaled->y_crop_height) {
av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
@@ -1352,6 +1362,7 @@
av1_resize_and_extend_frame_nonnormative(
unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
}
+#endif
return scaled;
} else {
return unscaled;
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
new file mode 100644
index 0000000..02f3bd8
--- /dev/null
+++ b/av1/common/x86/resize_ssse3.c
@@ -0,0 +1,138 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h> // SSSE3
+
+#include "av1/common/resize.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+// Loads 32 bytes at src, ANDs with *mask, packs each 16-bit lane into a byte.
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));  // src[0..15]
+  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));  // [16..31]
+  const __m128i a_and = _mm_and_si128(a, *mask);  // the caller in this file
+  const __m128i b_and = _mm_and_si128(b, *mask);  // passes 0x00FF per lane
+  return _mm_packus_epi16(a_and, b_and);  // a's 8 lanes, then b's, as bytes
+}
+// 2:1 decimation: dst(x, y) = src(2x, 2y), 16 output bytes per inner step.
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to 16
+  const __m128i mask = _mm_set1_epi16(0x00FF);  // low byte of each 16-bit lane
+  int y = dst_h;
+
+  do {
+    int x = max_width;  // NOTE(review): stores can pass dst_w; assumes padding
+    do {
+      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+      _mm_storeu_si128((__m128i *)dst, d);  // always a full 16-byte store
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);  // NOTE(review): do-while, so this assumes max_width > 0
+    src += 2 * (src_stride - max_width);  // net advance: two source rows
+    dst += dst_stride - max_width;  // net advance: one destination row
+  } while (--y);  // NOTE(review): do-while, so this assumes dst_h > 0
+}
+// Weights adjacent byte pairs of s[0..1] by c0c1, rounds, >> 7, packs to bytes.
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);  // rounding term for the >> 7
+  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);  // u8 * s8, pairs summed
+  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);  // into 16-bit lanes
+  // Add the rounding term (saturating add), then shift each lane right by 7.
+  const __m128i t2 = _mm_adds_epi16(t0, k_64);
+  const __m128i t3 = _mm_adds_epi16(t1, k_64);
+  const __m128i t4 = _mm_srai_epi16(t2, 7);
+  const __m128i t5 = _mm_srai_epi16(t3, 7);
+  return _mm_packus_epi16(t4, t5);  // s[0] results first, then s[1] results
+}
+// 2:1 downscale: every output byte blends a 2x2 source block using c0c1.
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to 16
+  int y = dst_h;
+
+  do {
+    int x = max_width;  // NOTE(review): stores can pass dst_w; assumes padding
+    do {
+      __m128i s[2], d[2];
+
+      // Horizontal pass: blend each adjacent byte pair within a row.
+      // Even source row: 32 bytes in, 16 filtered bytes out.
+      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Odd source row (one src_stride further on), filtered the same way.
+      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Vertical pass: interleave both rows' results and blend each pair.
+      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);  // always a full 16-byte store
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);  // NOTE(review): do-while, so this assumes max_width > 0
+    src += 2 * (src_stride - max_width);  // net advance: two source rows
+    dst += dst_stride - max_width;  // net advance: one destination row
+  } while (--y);  // NOTE(review): do-while, so this assumes dst_h > 0
+}
+// Scales each plane of src into dst, then calls aom_extend_frame_borders().
+void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+                                       YV12_BUFFER_CONFIG *dst,
+                                       const InterpFilter filter,
+                                       const int phase, const int num_planes) {
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;  // 0 for plane 0, 1 for every other plane
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+
+    if (2 * dst_w == src_w && 2 * dst_h == src_h) {  // exact 2:1 on both axes
+      if (phase == 0) {  // filter is not consulted on this path
+        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0c1);
+      } else {  // 2:1, phase != 0, filter != BILINEAR: filter/phase not passed
+        av1_resize_plane(src->buffers[i], src_h, src_w, src->strides[is_uv],
+                         dst->buffers[i], dst_h, dst_w, dst->strides[is_uv]);
+      }
+    } else {  // any other ratio: filter and phase are not passed on
+      av1_resize_plane(src->buffers[i], src_h, src_w, src->strides[is_uv],
+                       dst->buffers[i], dst_h, dst_w, dst->strides[is_uv]);
+    }
+  }
+  aom_extend_frame_borders(dst, num_planes);
+}