Add sse2 assembly for highbd_sad_4xn(_avg)

Size | Speed
4x16 | 5.13X
4x8  | 4.71X
4x4  | 2.34X

bug=aomedia:2432

Change-Id: I87a2d4c64bfc9ec30d94b2994b3ae951af5d23e5
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ad16f01..aa97ef6e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -723,6 +723,8 @@
     specialize qw/aom_highbd_sad16x16   avx2 sse2/;
     specialize qw/aom_highbd_sad16x8    avx2 sse2/;
     specialize qw/aom_highbd_sad8x4     sse2/;
+    specialize qw/aom_highbd_sad4x8     sse2/;
+    specialize qw/aom_highbd_sad4x4     sse2/;
 
     specialize qw/aom_highbd_sad128x128_avg avx2/;
     specialize qw/aom_highbd_sad128x64_avg  avx2/;
@@ -736,13 +738,17 @@
     specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
     specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
     specialize qw/aom_highbd_sad8x4_avg     sse2/;
+    specialize qw/aom_highbd_sad4x8_avg     sse2/;
+    specialize qw/aom_highbd_sad4x4_avg     sse2/;
 
+    specialize qw/aom_highbd_sad4x16        sse2/;
     specialize qw/aom_highbd_sad16x4        avx2 sse2/;
     specialize qw/aom_highbd_sad8x32        sse2/;
     specialize qw/aom_highbd_sad32x8        avx2 sse2/;
     specialize qw/aom_highbd_sad16x64       avx2 sse2/;
     specialize qw/aom_highbd_sad64x16       avx2 sse2/;
 
+    specialize qw/aom_highbd_sad4x16_avg    sse2/;
     specialize qw/aom_highbd_sad16x4_avg    avx2 sse2/;
     specialize qw/aom_highbd_sad8x32_avg    sse2/;
     specialize qw/aom_highbd_sad32x8_avg    avx2 sse2/;
diff --git a/aom_dsp/x86/highbd_sad_sse2.asm b/aom_dsp/x86/highbd_sad_sse2.asm
index 3398d8a..09e64d5 100644
--- a/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/aom_dsp/x86/highbd_sad_sse2.asm
@@ -372,3 +372,71 @@
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
+
+; unsigned int aom_highbd_sad4x{4,8,16}[_avg]_sse2(uint8_t *src, int src_stride,
+;     uint8_t *ref, int ref_stride); %2 == 1 (_avg) also reads second_pred.
+%macro HIGH_SAD4XN 1-2 0
+  HIGH_SAD_FN 4, %1, 7, %2
+  mov              n_rowsd, %1/4  ; loop count: each iteration consumes 4 rows
+  pxor                  m0, m0  ; m0 = running SAD accumulator (dword lanes)
+  pxor                  m6, m6  ; m6 = zero, used to widen words to dwords

+.loop:
+  movq                  m1, [refq]  ; 8 bytes (4 words) of ref, 1st row
+  movq                  m2, [refq+ref_strideq*2]  ; 2nd row; *2 presumably turns a stride in 16-bit samples into bytes
+  movq                  m3, [refq+ref_strideq*4]  ; 3rd row
+  movq                  m4, [refq+ref_stride3q*2]  ; 4th row; ref_stride3q presumably 3*ref_stride (set up outside this macro)
+  punpcklwd             m1, m3  ; m1 = words of 1st and 3rd loads interleaved
+  punpcklwd             m2, m4  ; m2 = words of 2nd and 4th loads interleaved
+%if %2 == 1
+  movq                  m3, [second_predq+8*0]  ; second_pred is read as four consecutive 8-byte groups
+  movq                  m5, [second_predq+8*2]
+  punpcklwd             m3, m5  ; groups 0 and 2 interleaved, same layout as m1
+  movq                  m4, [second_predq+8*1]
+  movq                  m5, [second_predq+8*3]
+  punpcklwd             m4, m5  ; groups 1 and 3 interleaved, same layout as m2
+  lea         second_predq, [second_predq+8*4]  ; advance past the 32 bytes just read
+  pavgw                 m1, m3  ; ref = word-wise average of ref and second_pred
+  pavgw                 m2, m4
+%endif
+  movq                  m5, [srcq]  ; src, 1st row
+  movq                  m3, [srcq+src_strideq*4]  ; src, 3rd row
+  punpcklwd             m5, m3  ; same interleave as m1
+  movdqa                m3, m1  ; keep a copy of ref for the reverse subtract
+  psubusw               m1, m5  ; saturating ref - src (0 where src > ref)
+  psubusw               m5, m3  ; saturating src - ref (0 where ref > src)
+  por                   m1, m5  ; m1 = |ref - src| per word
+  movq                  m5, [srcq+src_strideq*2]  ; src, 2nd row
+  movq                  m4, [srcq+src_stride3q*2]  ; src, 4th row
+  punpcklwd             m5, m4  ; same interleave as m2
+  movdqa                m4, m2
+  psubusw               m2, m5
+  psubusw               m5, m4
+  por                   m2, m5  ; m2 = |ref - src| per word
+  paddw                 m1, m2  ; add both abs-diff vectors, still in word lanes
+  movdqa                m2, m1
+  punpcklwd             m1, m6  ; zero-extend low 4 words to dwords
+  punpckhwd             m2, m6  ; zero-extend high 4 words to dwords
+  lea                 refq, [refq+ref_strideq*8]  ; step ref by 4x the per-row offset used above
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*8]  ; step src by 4x the per-row offset used above
+  paddd                 m0, m2
+  dec              n_rowsd
+  jg .loop  ; repeat while the counter is still positive

+  movhlps               m1, m0  ; horizontal reduce: bring upper 64 bits down
+  paddd                 m0, m1  ; dword lanes 0 and 1 now hold the partial sums
+  punpckldq             m0, m6  ; spread them out to lanes 0 and 2
+  movhlps               m1, m0  ; former lane 1 sum into lane 0 of m1
+  paddd                 m0, m1  ; lane 0 = total SAD
+  movd                 eax, m0  ; result is left in eax
+  RET
+%endmacro

+INIT_XMM sse2
+HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
+HIGH_SAD4XN  8 ; highbd_sad4x8_sse2
+HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
+HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
+HIGH_SAD4XN  8, 1 ; highbd_sad4x8_avg_sse2
+HIGH_SAD4XN  4, 1 ; highbd_sad4x4_avg_sse2
diff --git a/test/sad_test.cc b/test/sad_test.cc
index a4fd08d..ad971cc 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1213,6 +1213,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10),
@@ -1224,6 +1226,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 10),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12),
@@ -1235,6 +1239,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12),
   make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12),
   make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
 
   make_tuple(64, 16, &aom_sad64x16_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64_sse2, -1),
@@ -1255,13 +1261,13 @@
   make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12),
 
   make_tuple(16, 4, &aom_sad16x4_sse2, -1),
-  // make_tuple(4, 16, &aom_sad4x16_sse2, -1),
+  make_tuple(4, 16, &aom_sad4x16_sse2, -1),
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8),
-  // make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10),
-  // make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12),
-  // make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
@@ -1293,6 +1299,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 8),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 8),
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10),
@@ -1304,6 +1312,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 10),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 10),
   make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12),
   make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12),
   make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12),
@@ -1315,6 +1325,8 @@
   make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12),
   make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12),
   make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12),
+  make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
+  make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
 
   make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
   make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
@@ -1335,13 +1347,13 @@
   make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12),
 
   make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1),
-  // make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
+  make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8),
-  // make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10),
-  // make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10),
   make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12),
-  // make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
+  make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));