SIMD implementation of horz superres
SSE4.1 implementations of av1_convolve_horiz_rs and
av1_highbd_convolve_horiz_rs have been added, along
with the corresponding speed and correctness tests.
The interp_taps argument was defunct and has now been
removed and replaced with the UPSCALE_NORMATIVE_TAPS
macro.
Code associated with values of UPSCALE_NORMATIVE_TAPS
that are no longer used has been removed.
Change-Id: Ie74d8ca479a70c8d473ac12883cfe4f10b37a66d
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 5066ce2..c26aa3e 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -308,6 +308,12 @@
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/x86/convolve_sse2.c")
+if (CONFIG_HORZONLY_FRAME_SUPERRES)
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c")
+endif ()
+
set(AOM_AV1_COMMON_INTRIN_AVX2
${AOM_AV1_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/av1/common/x86/convolve_avx2.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7f40213..edd5fcb 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -70,9 +70,11 @@
specialize qw/av1_convolve_vert ssse3/;
if (aom_config("CONFIG_HORZONLY_FRAME_SUPERRES") eq "yes") {
- add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int interp_taps, const int x0_qn, const int x_step_qn";
+ add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, const int x0_qn, const int x_step_qn";
+ specialize qw/av1_convolve_horiz_rs sse4_1/;
- add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int interp_taps, const int x0_qn, const int x_step_qn, int bd";
+ add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, const int x0_qn, const int x_step_qn, int bd";
+ specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
}
add_proto qw/void av1_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 00c87c9..7de1074 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -18,6 +18,7 @@
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
@@ -32,9 +33,9 @@
void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- const int16_t *x_filters, int interp_taps,
- const int x0_qn, const int x_step_qn) {
- src -= interp_taps / 2 - 1;
+ const int16_t *x_filters, const int x0_qn,
+ const int x_step_qn) {
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
for (int y = 0; y < h; ++y) {
int x_qn = x0_qn;
for (int x = 0; x < w; ++x) {
@@ -42,9 +43,11 @@
const int x_filter_idx =
(x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
assert(x_filter_idx <= RS_SUBPEL_MASK);
- const int16_t *const x_filter = &x_filters[x_filter_idx * interp_taps];
+ const int16_t *const x_filter =
+ &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
int sum = 0;
- for (int k = 0; k < interp_taps; ++k) sum += src_x[k] * x_filter[k];
+ for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
x_qn += x_step_qn;
}
@@ -55,10 +58,9 @@
// TODO(yaowu: remove "const" from pass-by-value params in this and other funcs)
void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h,
- const int16_t *x_filters, int interp_taps,
- const int x0_qn, const int x_step_qn,
- int bd) {
- src -= interp_taps / 2 - 1;
+ const int16_t *x_filters, const int x0_qn,
+ const int x_step_qn, int bd) {
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
for (int y = 0; y < h; ++y) {
int x_qn = x0_qn;
for (int x = 0; x < w; ++x) {
@@ -66,9 +68,11 @@
const int x_filter_idx =
(x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
assert(x_filter_idx <= RS_SUBPEL_MASK);
- const int16_t *const x_filter = &x_filters[x_filter_idx * interp_taps];
+ const int16_t *const x_filter =
+ &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
int sum = 0;
- for (int k = 0; k < interp_taps; ++k) sum += src_x[k] * x_filter[k];
+ for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
x_qn += x_step_qn;
}
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 4b2a4bd..0bbecbc 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -208,75 +208,7 @@
#if CONFIG_HORZONLY_FRAME_SUPERRES
const int16_t av1_resize_filter_normative[(
1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
-#if UPSCALE_NORMATIVE_TAPS == 2
- { 128, 0 }, { 126, 2 }, { 124, 4 }, { 122, 6 }, { 120, 8 }, { 118, 10 },
- { 116, 12 }, { 114, 14 }, { 112, 16 }, { 110, 18 }, { 108, 20 }, { 106, 22 },
- { 104, 24 }, { 102, 26 }, { 100, 28 }, { 98, 30 }, { 96, 32 }, { 94, 34 },
- { 92, 36 }, { 90, 38 }, { 88, 40 }, { 86, 42 }, { 84, 44 }, { 82, 46 },
- { 80, 48 }, { 78, 50 }, { 76, 52 }, { 74, 54 }, { 72, 56 }, { 70, 58 },
- { 68, 60 }, { 66, 62 }, { 64, 64 }, { 62, 66 }, { 60, 68 }, { 58, 70 },
- { 56, 72 }, { 54, 74 }, { 52, 76 }, { 50, 78 }, { 48, 80 }, { 46, 82 },
- { 44, 84 }, { 42, 86 }, { 40, 88 }, { 38, 90 }, { 36, 92 }, { 34, 94 },
- { 32, 96 }, { 30, 98 }, { 28, 100 }, { 26, 102 }, { 24, 104 }, { 22, 106 },
- { 20, 108 }, { 18, 110 }, { 16, 112 }, { 14, 114 }, { 12, 116 }, { 10, 118 },
- { 8, 120 }, { 6, 122 }, { 4, 124 }, { 2, 126 },
-#elif UPSCALE_NORMATIVE_TAPS == 4
- { 0, 128, 0, 0 }, { -1, 128, 2, -1 }, { -2, 127, 4, -1 },
- { -3, 126, 7, -2 }, { -4, 125, 9, -2 }, { -5, 125, 11, -3 },
- { -6, 124, 13, -3 }, { -7, 123, 16, -4 }, { -7, 122, 18, -5 },
- { -8, 121, 20, -5 }, { -9, 120, 23, -6 }, { -9, 118, 25, -6 },
- { -10, 117, 28, -7 }, { -11, 116, 30, -7 }, { -11, 114, 33, -8 },
- { -12, 113, 35, -8 }, { -12, 111, 38, -9 }, { -13, 109, 41, -9 },
- { -13, 108, 43, -10 }, { -13, 106, 45, -10 }, { -13, 104, 48, -11 },
- { -14, 102, 51, -11 }, { -14, 100, 53, -11 }, { -14, 98, 56, -12 },
- { -14, 96, 58, -12 }, { -14, 94, 61, -13 }, { -15, 92, 64, -13 },
- { -15, 90, 66, -13 }, { -15, 87, 69, -13 }, { -14, 85, 71, -14 },
- { -14, 83, 73, -14 }, { -14, 80, 76, -14 }, { -14, 78, 78, -14 },
- { -14, 76, 80, -14 }, { -14, 73, 83, -14 }, { -14, 71, 85, -14 },
- { -13, 69, 87, -15 }, { -13, 66, 90, -15 }, { -13, 64, 92, -15 },
- { -13, 61, 94, -14 }, { -12, 58, 96, -14 }, { -12, 56, 98, -14 },
- { -11, 53, 100, -14 }, { -11, 51, 102, -14 }, { -11, 48, 104, -13 },
- { -10, 45, 106, -13 }, { -10, 43, 108, -13 }, { -9, 41, 109, -13 },
- { -9, 38, 111, -12 }, { -8, 35, 113, -12 }, { -8, 33, 114, -11 },
- { -7, 30, 116, -11 }, { -7, 28, 117, -10 }, { -6, 25, 118, -9 },
- { -6, 23, 120, -9 }, { -5, 20, 121, -8 }, { -5, 18, 122, -7 },
- { -4, 16, 123, -7 }, { -3, 13, 124, -6 }, { -3, 11, 125, -5 },
- { -2, 9, 125, -4 }, { -2, 7, 126, -3 }, { -1, 4, 127, -2 },
- { -1, 2, 128, -1 },
-#elif UPSCALE_NORMATIVE_TAPS == 6
- { 0, 0, 128, 0, 0, 0 }, { 0, -1, 128, 2, -1, 0 },
- { 1, -3, 127, 4, -2, 1 }, { 1, -4, 127, 6, -3, 1 },
- { 2, -6, 126, 8, -3, 1 }, { 2, -7, 125, 11, -4, 1 },
- { 2, -9, 125, 13, -5, 2 }, { 3, -10, 124, 15, -6, 2 },
- { 3, -11, 123, 18, -7, 2 }, { 3, -12, 122, 20, -8, 3 },
- { 4, -13, 121, 22, -9, 3 }, { 4, -14, 119, 25, -9, 3 },
- { 4, -15, 118, 27, -10, 4 }, { 4, -16, 117, 30, -11, 4 },
- { 5, -17, 116, 32, -12, 4 }, { 5, -17, 114, 35, -13, 4 },
- { 5, -18, 112, 37, -13, 5 }, { 5, -19, 111, 40, -14, 5 },
- { 6, -19, 109, 42, -15, 5 }, { 6, -20, 107, 45, -15, 5 },
- { 6, -20, 105, 48, -16, 5 }, { 6, -21, 103, 51, -17, 6 },
- { 6, -21, 101, 53, -17, 6 }, { 6, -21, 99, 56, -18, 6 },
- { 7, -22, 97, 58, -18, 6 }, { 7, -22, 95, 61, -19, 6 },
- { 7, -22, 93, 63, -19, 6 }, { 7, -22, 91, 66, -20, 6 },
- { 7, -22, 88, 69, -20, 6 }, { 7, -22, 86, 71, -21, 7 },
- { 7, -22, 83, 74, -21, 7 }, { 7, -22, 81, 76, -21, 7 },
- { 7, -22, 79, 79, -22, 7 }, { 7, -21, 76, 81, -22, 7 },
- { 7, -21, 74, 83, -22, 7 }, { 7, -21, 71, 86, -22, 7 },
- { 6, -20, 69, 88, -22, 7 }, { 6, -20, 66, 91, -22, 7 },
- { 6, -19, 63, 93, -22, 7 }, { 6, -19, 61, 95, -22, 7 },
- { 6, -18, 58, 97, -22, 7 }, { 6, -18, 56, 99, -21, 6 },
- { 6, -17, 53, 101, -21, 6 }, { 6, -17, 51, 103, -21, 6 },
- { 5, -16, 48, 105, -20, 6 }, { 5, -15, 45, 107, -20, 6 },
- { 5, -15, 42, 109, -19, 6 }, { 5, -14, 40, 111, -19, 5 },
- { 5, -13, 37, 112, -18, 5 }, { 4, -13, 35, 114, -17, 5 },
- { 4, -12, 32, 116, -17, 5 }, { 4, -11, 30, 117, -16, 4 },
- { 4, -10, 27, 118, -15, 4 }, { 3, -9, 25, 119, -14, 4 },
- { 3, -9, 22, 121, -13, 4 }, { 3, -8, 20, 122, -12, 3 },
- { 2, -7, 18, 123, -11, 3 }, { 2, -6, 15, 124, -10, 3 },
- { 2, -5, 13, 125, -9, 2 }, { 1, -4, 11, 125, -7, 2 },
- { 1, -3, 8, 126, -6, 2 }, { 1, -3, 6, 127, -4, 1 },
- { 1, -2, 4, 127, -3, 1 }, { 0, -1, 2, 128, -1, 0 },
-#elif UPSCALE_NORMATIVE_TAPS == 8
+#if UPSCALE_NORMATIVE_TAPS == 8
{ 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
{ 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
{ 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
@@ -311,7 +243,7 @@
{ 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
#else
#error "Invalid value of UPSCALE_NORMATIVE_TAPS"
-#endif // UPSCALE_NORMATIVE_TAPS == 2
+#endif // UPSCALE_NORMATIVE_TAPS == 8
};
#endif // CONFIG_HORZONLY_FRAME_SUPERRES
@@ -699,8 +631,8 @@
}
av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
- height2, &av1_resize_filter_normative[0][0],
- UPSCALE_NORMATIVE_TAPS, x0_qn, x_step_qn);
+ height2, &av1_resize_filter_normative[0][0], x0_qn,
+ x_step_qn);
// Restore the left/right border pixels
if (pad_left) {
@@ -1053,7 +985,7 @@
av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
CONVERT_TO_SHORTPTR(output), out_stride, width2,
height2, &av1_resize_filter_normative[0][0],
- UPSCALE_NORMATIVE_TAPS, x0_qn, x_step_qn, bd);
+ x0_qn, x_step_qn, bd);
// Restore the left/right border pixels
if (pad_left) {
diff --git a/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 0000000..82b65ed
--- /dev/null
+++ b/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, const int x0_qn,
+ const int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Pack 16-bit values into 8-bit values, i.e.
+ // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+ // -> [ 0 0 0 0 0 0 DC BA ]
+ const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+ // Write to the output
+ xx_storel_32(&dst_y[x], shifted_8);
+ }
+ }
+}
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn,
+ int bd) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+ const uint16_t *src_y;
+ uint16_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint16_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 0000000..7ef2a27
--- /dev/null
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using std::tr1::tuple;
+using std::tr1::make_tuple;
+using libaom_test::ACMRandom;
+
+template <typename Pixel>
+class TestImage {
+ public:
+ TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+ : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+ bd_(bd) {
+ assert(bd < 16);
+ assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+ assert(9 <= superres_denom && superres_denom <= 16);
+ assert(SCALE_NUMERATOR == 8);
+ assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+ w_dst_ = w_src_;
+ av1_calculate_unscaled_superres_size(&w_dst_, NULL, superres_denom);
+
+ src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+ dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+ // Allocate image data
+ src_data_.resize(2 * src_block_size());
+ dst_data_.resize(2 * dst_block_size());
+ }
+
+ void Initialize(ACMRandom *rnd);
+ void Check() const;
+
+ int src_stride() const { return src_stride_; }
+ int dst_stride() const { return dst_stride_; }
+
+ int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+ int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+ int src_width() const { return w_src_; }
+ int dst_width() const { return w_dst_; }
+ int height() const { return h_; }
+ int x0() const { return x0_; }
+
+ const Pixel *GetSrcData(bool ref, bool borders) const {
+ const Pixel *block = &src_data_[ref ? 0 : src_block_size()];
+ return borders ? block : block + kHPad + src_stride_ * kVPad;
+ }
+
+ Pixel *GetDstData(bool ref, bool borders) {
+ Pixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+ return borders ? block : block + kHPad + dst_stride_ * kVPad;
+ }
+
+ private:
+ int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+ int src_stride_, dst_stride_;
+
+ std::vector<Pixel> src_data_;
+ std::vector<Pixel> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+ if (!trash) {
+ memset(data, 0, sizeof(*data) * num_pixels);
+ return;
+ }
+ const Pixel mask = (1 << bd) - 1;
+ for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+ bool trash_edges, Pixel *data) {
+ assert(rnd);
+ const Pixel mask = (1 << bd) - 1;
+
+ // Fill in the first buffer with random data
+ // Top border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+ for (int r = 0; r < h; ++r) {
+ Pixel *row_data = data + (kVPad + r) * stride;
+ // Left border, contents, right border
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+ for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+ }
+ // Bottom border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+ const int bpp = sizeof(*data);
+ const int block_elts = stride * (h + 2 * kVPad);
+ const int block_size = bpp * block_elts;
+
+ // Now copy that to the second buffer
+ memcpy(data + block_elts, data, block_size);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+ PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, false, &src_data_[0]);
+ PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+ const int num_pixels = dst_block_size();
+ const Pixel *ref_dst = &dst_data_[0];
+ const Pixel *tst_dst = &dst_data_[num_pixels];
+
+ // If memcmp returns 0, there's nothing to do.
+ if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+ // Otherwise, iterate through the buffer looking for differences, *ignoring
+ // the edges*
+ const int stride = dst_stride_;
+ for (int r = kVPad; r < h_ + kVPad; ++r) {
+ for (int c = kVPad; c < w_dst_ + kHPad; ++c) {
+ const int32_t ref_value = ref_dst[r * stride + c];
+ const int32_t tst_value = tst_dst[r * stride + c];
+
+ EXPECT_EQ(tst_value, ref_value)
+ << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+ << ", superres_denom: " << superres_denom_ << ", height: " << h_
+ << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+ << ", x0: " << x0_;
+ }
+ }
+}
+
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+ ConvolveHorizRSTestBase() : image_(NULL) {}
+ virtual ~ConvolveHorizRSTestBase() {}
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ // Implemented by subclasses (SetUp depends on the parameters passed
+ // in and RunOne depends on the function to be tested. These can't
+ // be templated for low/high bit depths because they have different
+ // numbers of parameters)
+ virtual void SetUp() = 0;
+ virtual void RunOne(bool ref) = 0;
+
+ protected:
+ void SetBitDepth(int bd) { bd_ = bd; }
+
+ void CorrectnessTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int i = 0; i < kTestIters; ++i) {
+ for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+ // Get a random height between 512 and 767
+ int height = rnd.Rand8() + 512;
+
+ // Get a random src width between 128 and 383
+ int width_src = rnd.Rand8() + 128;
+
+ // x0 is normally calculated by get_upscale_convolve_x0 in
+ // av1/common/resize.c. However, this test should work for
+ // any value of x0 between 0 and RS_SCALE_SUBPEL_MASK
+ // (inclusive), so we choose one at random.
+ int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+ image_ =
+ new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+ Prep(&rnd);
+ RunOne(true);
+ RunOne(false);
+ image_->Check();
+
+ delete image_;
+ }
+ }
+ }
+
+ void SpeedTest() {
+ // Pick some specific parameters to test
+ int height = 767;
+ int width_src = 129;
+ int superres_denom = 13;
+ int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+ image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ Prep(&rnd);
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: CDEFSpeedTest, SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+ }
+
+ void Prep(ACMRandom *rnd) {
+ assert(rnd);
+ image_->Initialize(rnd);
+ }
+
+ int bd_;
+ TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+// <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+class LowBDConvolveHorizRSTest
+ : public ConvolveHorizRSTestBase<uint8_t>,
+ public ::testing::WithParamInterface<LowBDParams> {
+ public:
+ virtual ~LowBDConvolveHorizRSTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+ const int bd = 8;
+ SetBitDepth(bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint8_t *src = image_->GetSrcData(ref, false);
+ uint8_t *dst = image_->GetDstData(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+ const int width_src = image_->src_width();
+ const int width_dst = image_->dst_width();
+ const int height = image_->height();
+ const int x0_qn = image_->x0();
+
+ const int32_t x_step_qn =
+ av1_get_upscale_convolve_step(width_src, width_dst);
+
+ if (ref) {
+ av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
+ height, &av1_resize_filter_normative[0][0], x0_qn,
+ x_step_qn);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+ &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
+ }
+ }
+
+ private:
+ LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, LowBDConvolveHorizRSTest,
+ ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn,
+ int bd);
+
+// Test parameter list:
+// <tst_fun_, bd_>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+class HighBDConvolveHorizRSTest
+ : public ConvolveHorizRSTestBase<uint16_t>,
+ public ::testing::WithParamInterface<HighBDParams> {
+ public:
+ virtual ~HighBDConvolveHorizRSTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+ const int bd = GET_PARAM(1);
+ SetBitDepth(bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint16_t *src = image_->GetSrcData(ref, false);
+ uint16_t *dst = image_->GetDstData(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+ const int width_src = image_->src_width();
+ const int width_dst = image_->dst_width();
+ const int height = image_->height();
+ const int x0_qn = image_->x0();
+
+ const int32_t x_step_qn =
+ av1_get_upscale_convolve_step(width_src, width_dst);
+
+ if (ref) {
+ av1_highbd_convolve_horiz_rs_c(
+ src, src_stride, dst, dst_stride, width_dst, height,
+ &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+ &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+ }
+ }
+
+ private:
+ HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, HighBDConvolveHorizRSTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+ ::testing::ValuesIn(kBDs)));
+
+} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 1522436..7722242f 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -253,6 +253,12 @@
"${AOM_ROOT}/test/av1_convolve_scale_test.cc")
endif ()
+ if (HAVE_SSE4_1)
+ set(AOM_UNIT_TEST_ENCODER_SOURCES
+ ${AOM_UNIT_TEST_ENCODER_SOURCES}
+ "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc")
+ endif ()
+
set(AOM_UNIT_TEST_ENCODER_SOURCES
${AOM_UNIT_TEST_ENCODER_SOURCES}
"${AOM_ROOT}/test/av1_fht16x32_test.cc"