Add an SSE2 implementation of av1_block_error and unit tests for it.
Change-Id: Ide1cfa5c70d83071d72d0910abeaf7b5d565aa50
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 53bb4fb..0e33221 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -210,7 +210,7 @@
# the transform coefficients are held in 32-bit
# values, so the assembler code for av1_block_error can no longer be used.
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/av1_block_error avx2 neon/;
+ specialize qw/av1_block_error sse2 avx2 neon/;
add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/av1_quantize_fp sse2 avx2 neon/;
diff --git a/av1/encoder/x86/error_sse2.asm b/av1/encoder/x86/error_sse2.asm
index 72e9e22..f4b4968 100644
--- a/av1/encoder/x86/error_sse2.asm
+++ b/av1/encoder/x86/error_sse2.asm
@@ -11,6 +11,21 @@
;
+; Advance pointer %1 by %2 tran_low_t elements (4 bytes per element).
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+ lea %1, [%1 + %2 * 4]
+%endmacro
+
+; Load 8 tran_low_t values starting at element offset %3 from %2 into m%1.
+; %3 is the offset in elements, not bytes.
+; tran_low_t elements are 32 bits wide here (hence the "* 4" scaling), so the
+; values are read as two consecutive 16-byte halves and packed down to 16 bits
+; each with signed saturation (packssdw).
+%macro LOAD_TRAN_LOW 3
+ mova m%1, [%2 + (%3) * 4]
+ packssdw m%1, [%2 + (%3) * 4 + 16]
+%endmacro
+
%define private_prefix av1
%include "third_party/x86inc/x86inc.asm"
@@ -25,14 +40,14 @@
pxor m4, m4 ; sse accumulator
pxor m6, m6 ; ssz accumulator
pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -41,25 +56,19 @@
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
; accumulate in 64bit
punpckldq m7, m0, m5
punpckhdq m0, m5
paddq m4, m7
- punpckldq m7, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m7
punpckldq m7, m2, m5
- paddq m4, m1
+ paddq m4, m0
punpckhdq m2, m5
paddq m6, m7
- punpckldq m7, m3, m5
paddq m6, m2
- punpckhdq m3, m5
- paddq m6, m7
- paddq m6, m3
- add sizeq, mmsize
- jl .loop
+ jg .loop
; accumulate horizontally and store in return value
movhlps m5, m4
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index d98cc09..da2ceba 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -234,7 +234,7 @@
using ::testing::make_tuple;
-#if (HAVE_SSE2 || HAVE_AVX)
+#if (HAVE_SSE2)
INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest,
::testing::Values(make_tuple(&av1_highbd_block_error_sse2,
@@ -242,7 +242,10 @@
make_tuple(&av1_highbd_block_error_sse2,
&av1_highbd_block_error_c, AOM_BITS_12),
make_tuple(&av1_highbd_block_error_sse2,
- &av1_highbd_block_error_c, AOM_BITS_8)));
+ &av1_highbd_block_error_c, AOM_BITS_8),
+ make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
+ &BlockError8BitWrapper<av1_block_error_c>,
+ AOM_BITS_8)));
#endif // HAVE_SSE2
#if (HAVE_AVX2)