sad4d_sse2.asm: remove remaining x4d_avg support
Bug: aomedia:2563
Bug: aomedia:3416
Change-Id: I53191ef53e1e774924db13a6e0740be6d8fef607
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index 1e89521..6696c40 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -15,13 +15,6 @@
SECTION .text
-%macro AVG_4x2x4 2
- movh m2, [second_predq]
- movlhps m2, m2
- pavgb %1, m2
- pavgb %2, m2
- lea second_predq, [second_predq+8]
-%endmacro
; 'spill_src_stride' affect a lot how the code works.
;
; When 'spill_src_stride' is false, the 'src_strideq' resides in
@@ -64,8 +57,8 @@
lea ref4q, [ref4q+ref_strideq*2]
%endmacro
-; PROCESS_4x2x4 first, do_avg
-%macro PROCESS_4x2x4 2
+; PROCESS_4x2x4 first
+%macro PROCESS_4x2x4 1
movd m0, [srcq]
HANDLE_SECOND_OFFSET
%if %1 == 1
@@ -87,9 +80,6 @@
movlhps m0, m0
movlhps m6, m4
movlhps m7, m5
-%if %2 == 1
- AVG_4x2x4 m6, m7
-%endif
psadbw m6, m0
psadbw m7, m0
%else
@@ -110,9 +100,6 @@
movlhps m0, m0
movlhps m1, m2
movlhps m3, m4
-%if %2 == 1
- AVG_4x2x4 m1, m3
-%endif
psadbw m1, m0
psadbw m3, m0
paddd m6, m1
@@ -120,8 +107,8 @@
%endif
%endmacro
-; PROCESS_8x2x4 first, do_avg
-%macro PROCESS_8x2x4 2
+; PROCESS_8x2x4 first
+%macro PROCESS_8x2x4 1
movh m0, [srcq]
HANDLE_SECOND_OFFSET
%if %1 == 1
@@ -134,14 +121,6 @@
movhps m5, [ref2q+ref_strideq]
movhps m6, [ref3q+ref_strideq]
movhps m7, [ref4q+ref_strideq]
-%if %2 == 1
- movu m3, [second_predq]
- pavgb m4, m3
- pavgb m5, m3
- pavgb m6, m3
- pavgb m7, m3
- lea second_predq, [second_predq+mmsize]
-%endif
psadbw m4, m0
psadbw m5, m0
psadbw m6, m0
@@ -152,11 +131,6 @@
movhps m0, [srcq + second_offset]
movhps m1, [ref1q+ref_strideq]
movhps m2, [ref2q+ref_strideq]
-%if %2 == 1
- movu m3, [second_predq]
- pavgb m1, m3
- pavgb m2, m3
-%endif
psadbw m1, m0
psadbw m2, m0
paddd m4, m1
@@ -166,11 +140,6 @@
movhps m1, [ref3q+ref_strideq]
movh m2, [ref4q]
movhps m2, [ref4q+ref_strideq]
-%if %2 == 1
- pavgb m1, m3
- pavgb m2, m3
- lea second_predq, [second_predq+mmsize]
-%endif
psadbw m1, m0
psadbw m2, m0
paddd m6, m1
@@ -178,37 +147,24 @@
%endif
%endmacro
-; PROCESS_FIRST_MMSIZE do_avg
-%macro PROCESS_FIRST_MMSIZE 1
+; PROCESS_FIRST_MMSIZE
+%macro PROCESS_FIRST_MMSIZE 0
mova m0, [srcq]
movu m4, [ref1q]
movu m5, [ref2q]
movu m6, [ref3q]
movu m7, [ref4q]
-%if %1 == 1
- movu m3, [second_predq]
- pavgb m4, m3
- pavgb m5, m3
- pavgb m6, m3
- pavgb m7, m3
- lea second_predq, [second_predq+mmsize]
-%endif
psadbw m4, m0
psadbw m5, m0
psadbw m6, m0
psadbw m7, m0
%endmacro
-; PROCESS_16x1x4 offset, do_avg
-%macro PROCESS_16x1x4 2
+; PROCESS_16x1x4 offset
+%macro PROCESS_16x1x4 1
mova m0, [srcq + %1]
movu m1, [ref1q + ref_offsetq + %1]
movu m2, [ref2q + ref_offsetq + %1]
-%if %2 == 1
- movu m3, [second_predq]
- pavgb m1, m3
- pavgb m2, m3
-%endif
psadbw m1, m0
psadbw m2, m0
paddd m4, m1
@@ -216,11 +172,6 @@
movu m1, [ref3q + ref_offsetq + %1]
movu m2, [ref4q + ref_offsetq + %1]
-%if %2 == 1
- pavgb m1, m3
- pavgb m2, m3
- lea second_predq, [second_predq+mmsize]
-%endif
psadbw m1, m0
psadbw m2, m0
paddd m6, m1
@@ -233,10 +184,8 @@
; Macro Arguments:
; 1: Width
; 2: Height
-; 3: If 0, then normal sad, else avg
-; 4: If 0, then normal sad, else skip rows
-; TODO(jzern): remove the remnants of avg and merge %3 & %4
-%macro SADNXN4D 2-4 0,0
+; 3: If 0, then normal sad, else skip rows
+%macro SADNXN4D 2-3 0
%define spill_src_stride 0
%define spill_ref_stride 0
@@ -250,7 +199,7 @@
; Remove loops in the 4x4 and 8x4 case
%define use_loop (use_ref_offset || %2 > 4)
-%if %4 == 1 ; skip rows
+%if %3 == 1 ; skip rows
%if ARCH_X86_64
%if use_ref_offset
cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
@@ -278,7 +227,6 @@
%endif
%endif
%else ; normal sad
-ASSERT %3 == 0
%if ARCH_X86_64
%if use_ref_offset
cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
@@ -319,7 +267,7 @@
%define cntd word [rsp]
%endif
-%if %4 == 1
+%if %3 == 1
sal src_strided, 1
sal ref_strided, 1
%endif
@@ -336,14 +284,12 @@
%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
%if use_ref_offset
- PROCESS_FIRST_MMSIZE %3
+ PROCESS_FIRST_MMSIZE
%if %1 > mmsize
mov ref_offsetq, 0
- mov cntd, %2 >> %4
+ mov cntd, %2 >> %3
; Jump part way into the loop for the square version of this width
%if %3 == 1
- jmp mangle(private_prefix %+ _sad%1x%1x4d_avg %+ SUFFIX).midloop
-%elif %4 == 1
jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
%else
jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
@@ -351,14 +297,14 @@
%else
mov ref_offsetq, ref_strideq
add srcq, src_strideq
- mov cntd, (%2 >> %4) - 1
+ mov cntd, (%2 >> %3) - 1
%endif
%if external_loop == 0
.loop:
; Unrolled horizontal loop
%assign h_offset 0
%rep %1/mmsize
- PROCESS_16x1x4 h_offset, %3
+ PROCESS_16x1x4 h_offset
%if h_offset == 0
; The first row of the first column is done outside the loop and jumps here
.midloop:
@@ -372,13 +318,13 @@
jnz .loop
%endif
%else
- PROCESS_%1x2x4 1, %3
+ PROCESS_%1x2x4 1
ADVANCE_END_OF_TWO_LINES
%if use_loop
- mov cntd, (%2/2 >> %4) - 1
+ mov cntd, (%2/2 >> %3) - 1
.loop:
%endif
- PROCESS_%1x2x4 0, %3
+ PROCESS_%1x2x4 0
%if use_loop
ADVANCE_END_OF_TWO_LINES
sub cntd, 1
@@ -398,7 +344,7 @@
%endif
; Undo modifications on parameters on the stack
-%if %4 == 1
+%if %3 == 1
%if spill_src_stride
shr src_strided, 1
%endif
@@ -417,7 +363,7 @@
punpcklqdq m4, m6
punpckhqdq m5, m7
paddd m4, m5
-%if %4 == 1
+%if %3 == 1
pslld m4, 1
%endif
movifnidn resultq, resultmp
@@ -426,7 +372,7 @@
%else
pshufd m6, m6, 0x08
pshufd m7, m7, 0x08
-%if %4 == 1
+%if %3 == 1
pslld m6, 1
pslld m7, 1
%endif
@@ -463,29 +409,29 @@
SADNXN4D 16, 64
SADNXN4D 64, 16
%endif
-SADNXN4D 128, 128, 0, 1
-SADNXN4D 128, 64, 0, 1
-SADNXN4D 64, 128, 0, 1
-SADNXN4D 64, 64, 0, 1
-SADNXN4D 64, 32, 0, 1
-SADNXN4D 32, 64, 0, 1
-SADNXN4D 32, 32, 0, 1
-SADNXN4D 32, 16, 0, 1
-SADNXN4D 16, 32, 0, 1
-SADNXN4D 16, 16, 0, 1
-SADNXN4D 16, 8, 0, 1
-SADNXN4D 8, 16, 0, 1
-SADNXN4D 8, 8, 0, 1
-SADNXN4D 4, 8, 0, 1
+SADNXN4D 128, 128, 1
+SADNXN4D 128, 64, 1
+SADNXN4D 64, 128, 1
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
%if CONFIG_REALTIME_ONLY==0
-SADNXN4D 4, 16, 0, 1
-SADNXN4D 8, 32, 0, 1
-SADNXN4D 32, 8, 0, 1
-SADNXN4D 16, 64, 0, 1
-SADNXN4D 64, 16, 0, 1
+SADNXN4D 4, 16, 1
+SADNXN4D 8, 32, 1
+SADNXN4D 32, 8, 1
+SADNXN4D 16, 64, 1
+SADNXN4D 64, 16, 1
%endif
; Different assembly is needed when the height gets subsampled to 2
-; SADNXN4D 16, 4, 0, 1
-; SADNXN4D 8, 4, 0, 1
-; SADNXN4D 4, 4, 0, 1
+; SADNXN4D 16, 4, 1
+; SADNXN4D 8, 4, 1
+; SADNXN4D 4, 4, 1