Add SSE2 implementation of the frame error module
Module-level speedup is ~1.13x relative to the C code.
Change-Id: I716aafb132482a2fc2e15ea70fa94f7982cc440a
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 90952ae..cdab7f5 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -226,7 +226,8 @@
"${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
"${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
- "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
+ "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c")
list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index d1a38fa..0a38691 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -337,7 +337,7 @@
specialize qw/av1_highbd_warp_affine sse4_1/;
add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int stride, const uint8_t *const dst, int p_width, int p_height, int p_stride";
-specialize qw/av1_calc_frame_error avx2/;
+specialize qw/av1_calc_frame_error sse2 avx2/;
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/double av1_compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 0000000..6ff6665
--- /dev/null
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+// Returns the summed error measure between dst and ref over a region of
+// p_width x p_height 8-bit samples.
+//
+// Each row is handled in groups of 16 pixels with SSE2: (dst - ref) + 255 is
+// used as an index into error_measure_lut. The trailing (p_width % 16) pixels
+// of each row go through the scalar error_measure() instead.
+// Unaligned loads are used because nothing in this signature guarantees that
+// the row addresses are 16-byte aligned.
+int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
+                                  const uint8_t *const dst, int p_width,
+                                  int p_height, int dst_stride) {
+  int64_t sum_error = 0;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dup_255 = _mm_set1_epi16(255);
+  // Two 64-bit lanes accumulating the total of all vectorized rows.
+  __m128i col_error = zero;
+  const int vec_blocks = p_width / 16;
+  for (int i = 0; i < p_height; i++) {
+    const uint8_t *const ref_row = ref + i * ref_stride;
+    const uint8_t *const dst_row = dst + i * dst_stride;
+    // Four 32-bit partial sums for the current row.
+    __m128i row_error = zero;
+    for (int j = 0; j < vec_blocks; j++) {
+      const __m128i ref_8 =
+          _mm_loadu_si128((const __m128i *)(ref_row + j * 16));
+      const __m128i dst_8 =
+          _mm_loadu_si128((const __m128i *)(dst_row + j * 16));
+      // Zero-extend the 16 bytes of each input to 16-bit lanes.
+      const __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
+      const __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
+      const __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
+      const __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
+
+      // (dst - ref) + 255 is in [0, 510]; it indexes error_measure_lut.
+      const __m128i diff_1 =
+          _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255);
+      const __m128i diff_2 =
+          _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255);
+
+      // SSE2 has no gather, so every lane is extracted and looked up
+      // individually, then repacked four results at a time.
+      const __m128i error_1_lo =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 2)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 1)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
+      const __m128i error_1_hi =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 6)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 5)],
+                        error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
+      const __m128i error_2_lo =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 2)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 1)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
+      const __m128i error_2_hi =
+          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 6)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 5)],
+                        error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
+
+      const __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
+      const __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
+      const __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
+
+      row_error = _mm_add_epi32(row_error, error_1_2);
+    }
+    // Widen the 32-bit row sums to 64 bits before adding to the total.
+    const __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
+    const __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
+    const __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
+    col_error = _mm_add_epi64(col_error, col_error_temp);
+    // Error summation for remaining width, which is not multiple of 16
+    for (int l = vec_blocks * 16; l < p_width; ++l) {
+      sum_error += (int64_t)error_measure(dst_row[l] - ref_row[l]);
+    }
+  }
+  int64_t sum_error_d_0, sum_error_d_1;
+  xx_storel_64(&sum_error_d_0, col_error);
+  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
+  return sum_error + sum_error_d_0 + sum_error_d_1;
+}
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
index edfacf2..73b880a 100644
--- a/test/frame_error_test.cc
+++ b/test/frame_error_test.cc
@@ -27,7 +27,7 @@
typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
const uint8_t *const dst, int p_width,
int p_height, int p_stride);
-#if HAVE_AVX2
+#if HAVE_AVX2 || HAVE_SSE2
const int kBlockWidth[] = {
832, 834, 640, 1280, 1920,
};
@@ -145,6 +145,14 @@
RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1FrameErrorTest,
+ ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
+ ::testing::ValuesIn(kBlockWidth),
+ ::testing::ValuesIn(kBlockHeight)));
+#endif
+
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, AV1FrameErrorTest,