Add SSE2 assembly for highbd_sad4xN(_avg)
Size | Speed
4x16 | 5.13X
4x8 | 4.71X
4x4 | 2.34X
bug=aomedia:2432
Change-Id: I87a2d4c64bfc9ec30d94b2994b3ae951af5d23e5
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ad16f01..aa97ef6 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -723,6 +723,8 @@
specialize qw/aom_highbd_sad16x16 avx2 sse2/;
specialize qw/aom_highbd_sad16x8 avx2 sse2/;
specialize qw/aom_highbd_sad8x4 sse2/;
+ specialize qw/aom_highbd_sad4x8 sse2/;
+ specialize qw/aom_highbd_sad4x4 sse2/;
specialize qw/aom_highbd_sad128x128_avg avx2/;
specialize qw/aom_highbd_sad128x64_avg avx2/;
@@ -736,13 +738,17 @@
specialize qw/aom_highbd_sad16x16_avg avx2 sse2/;
specialize qw/aom_highbd_sad16x8_avg avx2 sse2/;
specialize qw/aom_highbd_sad8x4_avg sse2/;
+ specialize qw/aom_highbd_sad4x8_avg sse2/;
+ specialize qw/aom_highbd_sad4x4_avg sse2/;
+ specialize qw/aom_highbd_sad4x16 sse2/;
specialize qw/aom_highbd_sad16x4 avx2 sse2/;
specialize qw/aom_highbd_sad8x32 sse2/;
specialize qw/aom_highbd_sad32x8 avx2 sse2/;
specialize qw/aom_highbd_sad16x64 avx2 sse2/;
specialize qw/aom_highbd_sad64x16 avx2 sse2/;
+ specialize qw/aom_highbd_sad4x16_avg sse2/;
specialize qw/aom_highbd_sad16x4_avg avx2 sse2/;
specialize qw/aom_highbd_sad8x32_avg sse2/;
specialize qw/aom_highbd_sad32x8_avg avx2 sse2/;
diff --git a/aom_dsp/x86/highbd_sad_sse2.asm b/aom_dsp/x86/highbd_sad_sse2.asm
index 3398d8a..09e64d5 100644
--- a/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/aom_dsp/x86/highbd_sad_sse2.asm
@@ -372,3 +372,71 @@
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
+
+; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD4XN 1-2 0
+ HIGH_SAD_FN 4, %1, 7, %2 ; prologue macro defined outside this hunk; %1 = block height, %2 = 1 selects the _avg variant
+ mov n_rowsd, %1/4 ; loop count: every iteration consumes four rows
+ pxor m0, m0 ; m0 = running 32-bit SAD accumulators
+ pxor m6, m6 ; m6 = zero, used below to widen words to dwords
+
+.loop: ; one pass handles four rows of four 16-bit samples
+ movq m1, [refq] ; ref row 0 (8 bytes = 4 words)
+ movq m2, [refq+ref_strideq*2] ; ref row 1; *2 suggests strides are in 16-bit samples - confirm
+ movq m3, [refq+ref_strideq*4] ; ref row 2
+ movq m4, [refq+ref_stride3q*2] ; ref row 3, assuming ref_stride3 = 3*ref_stride (set by HIGH_SAD_FN - confirm)
+ punpcklwd m1, m3 ; m1 = ref rows 0 and 2, word-interleaved
+ punpcklwd m2, m4 ; m2 = ref rows 1 and 3, word-interleaved
+%if %2 == 1
+ movq m3, [second_predq+8*0] ; second_pred row 0; rows are read 8 bytes apart
+ movq m5, [second_predq+8*2] ; second_pred row 2
+ punpcklwd m3, m5 ; same 0/2 interleave as m1
+ movq m4, [second_predq+8*1] ; second_pred row 1
+ movq m5, [second_predq+8*3] ; second_pred row 3
+ punpcklwd m4, m5 ; same 1/3 interleave as m2
+ lea second_predq, [second_predq+8*4] ; advance second_pred by the four rows just read
+ pavgw m1, m3 ; rounded unsigned word average of ref and second_pred
+ pavgw m2, m4
+%endif
+ movq m5, [srcq] ; src row 0
+ movq m3, [srcq+src_strideq*4] ; src row 2
+ punpcklwd m5, m3 ; m5 = src rows 0 and 2, matching m1's layout
+ movdqa m3, m1 ; keep a copy of ref because psubusw overwrites its destination
+ psubusw m1, m5 ; max(ref - src, 0)
+ psubusw m5, m3 ; max(src - ref, 0)
+ por m1, m5 ; one side is always zero, so OR gives |ref - src|
+ movq m5, [srcq+src_strideq*2] ; src row 1
+ movq m4, [srcq+src_stride3q*2] ; src row 3 (same src_stride3 assumption as for ref)
+ punpcklwd m5, m4 ; m5 = src rows 1 and 3, matching m2's layout
+ movdqa m4, m2 ; copy of ref rows 1/3 for the reverse subtraction
+ psubusw m2, m5
+ psubusw m5, m4
+ por m2, m5 ; m2 = |ref - src| for rows 1 and 3
+ paddw m1, m2 ; add the two sets of absolute differences as 16-bit words
+ movdqa m2, m1
+ punpcklwd m1, m6 ; zero-extend low four words to dwords
+ punpckhwd m2, m6 ; zero-extend high four words to dwords
+ lea refq, [refq+ref_strideq*8] ; step ref down four rows
+ paddd m0, m1 ; accumulate in 32 bits
+ lea srcq, [srcq+src_strideq*8] ; step src down four rows
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop ; repeat while row groups remain
+
+ movhlps m1, m0 ; horizontal reduction: bring upper 64 bits down
+ paddd m0, m1 ; low two dwords now hold the two partial sums
+ punpckldq m0, m6 ; spread them out as [s0, 0, s1, 0]
+ movhlps m1, m0 ; m1 low dword = s1
+ paddd m0, m1 ; low dword = s0 + s1 = total SAD
+ movd eax, m0 ; return the total in eax
+ RET
+%endmacro
+
+INIT_XMM sse2 ; presumably x86inc setup selecting XMM registers and the _sse2 name suffix - confirm
+HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
+HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
+HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
+HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
+HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2
+HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2
diff --git a/test/sad_test.cc b/test/sad_test.cc
index a4fd08d..ad971cc 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1213,6 +1213,8 @@
make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8),
make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8),
make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 8),
make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10),
make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10),
make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10),
@@ -1224,6 +1226,8 @@
make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 10),
make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10),
make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 10),
make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12),
make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12),
make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12),
@@ -1235,6 +1239,8 @@
make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12),
make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12),
make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
make_tuple(64, 16, &aom_sad64x16_sse2, -1),
make_tuple(16, 64, &aom_sad16x64_sse2, -1),
@@ -1255,13 +1261,13 @@
make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12),
make_tuple(16, 4, &aom_sad16x4_sse2, -1),
- // make_tuple(4, 16, &aom_sad4x16_sse2, -1),
+ make_tuple(4, 16, &aom_sad4x16_sse2, -1),
make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8),
- // make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10),
- // make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10),
make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12),
- // make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
};
INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
@@ -1293,6 +1299,8 @@
make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8),
make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8),
make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 8),
make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10),
@@ -1304,6 +1312,8 @@
make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10),
make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10),
make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 10),
make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12),
@@ -1315,6 +1325,8 @@
make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12),
make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12),
make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
@@ -1335,13 +1347,13 @@
make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12),
make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1),
- // make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
+ make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8),
- // make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10),
- // make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10),
make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12),
- // make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
};
INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));