sad4d_sse2.asm: remove remaining x4d_avg support

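Delete the leftover second_pred averaging code: the AVG_4x2x4 macro,
the do_avg argument of the PROCESS_* macros, and the avg argument of
SADNXN4D, whose third argument now selects skip-rows instead.

For reference, a rough C sketch of the per-byte averaging that the
deleted pavgb paths performed (illustrative only, not the library's
API; avg_byte is a hypothetical helper):

  #include <stdint.h>

  /* pavgb: unsigned rounding average of two bytes, (a + b + 1) >> 1 */
  static inline uint8_t avg_byte(uint8_t a, uint8_t b) {
    return (uint8_t)(((unsigned)a + b + 1) >> 1);
  }

  /* The removed avg paths averaged each reference row with
   * second_pred this way before the psadbw accumulation; the
   * remaining kernels SAD the references against src directly. */
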
Bug: aomedia:2563
Bug: aomedia:3416
Change-Id: I53191ef53e1e774924db13a6e0740be6d8fef607
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index 1e89521..6696c40 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -15,13 +15,6 @@
 
 SECTION .text
 
-%macro AVG_4x2x4 2
-  movh                  m2, [second_predq]
-  movlhps               m2, m2
-  pavgb                 %1, m2
-  pavgb                 %2, m2
-  lea                   second_predq, [second_predq+8]
-%endmacro
 ; 'spill_src_stride' greatly affects how the code works.
 ;
 ; When 'spill_src_stride' is false, the 'src_strideq' resides in
@@ -64,8 +57,8 @@
   lea                ref4q, [ref4q+ref_strideq*2]
 %endmacro
 
-; PROCESS_4x2x4 first, do_avg
-%macro PROCESS_4x2x4 2
+; PROCESS_4x2x4 first
+%macro PROCESS_4x2x4 1
   movd                  m0, [srcq]
   HANDLE_SECOND_OFFSET
 %if %1 == 1
@@ -87,9 +80,6 @@
   movlhps               m0, m0
   movlhps               m6, m4
   movlhps               m7, m5
-%if %2 == 1
-  AVG_4x2x4             m6, m7
-%endif
   psadbw                m6, m0
   psadbw                m7, m0
 %else
@@ -110,9 +100,6 @@
   movlhps               m0, m0
   movlhps               m1, m2
   movlhps               m3, m4
-%if %2 == 1
-  AVG_4x2x4             m1, m3
-%endif
   psadbw                m1, m0
   psadbw                m3, m0
   paddd                 m6, m1
@@ -120,8 +107,8 @@
 %endif
 %endmacro
 
-; PROCESS_8x2x4 first, do_avg
-%macro PROCESS_8x2x4 2
+; PROCESS_8x2x4 first
+%macro PROCESS_8x2x4 1
   movh                  m0, [srcq]
   HANDLE_SECOND_OFFSET
 %if %1 == 1
@@ -134,14 +121,6 @@
   movhps                m5, [ref2q+ref_strideq]
   movhps                m6, [ref3q+ref_strideq]
   movhps                m7, [ref4q+ref_strideq]
-%if %2 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m4, m3
-  pavgb                 m5, m3
-  pavgb                 m6, m3
-  pavgb                 m7, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
   psadbw                m4, m0
   psadbw                m5, m0
   psadbw                m6, m0
@@ -152,11 +131,6 @@
   movhps                m0, [srcq + second_offset]
   movhps                m1, [ref1q+ref_strideq]
   movhps                m2, [ref2q+ref_strideq]
-%if %2 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-%endif
   psadbw                m1, m0
   psadbw                m2, m0
   paddd                 m4, m1
@@ -166,11 +140,6 @@
   movhps                m1, [ref3q+ref_strideq]
   movh                  m2, [ref4q]
   movhps                m2, [ref4q+ref_strideq]
-%if %2 == 1
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
   psadbw                m1, m0
   psadbw                m2, m0
   paddd                 m6, m1
@@ -178,37 +147,24 @@
 %endif
 %endmacro
 
-; PROCESS_FIRST_MMSIZE do_avg
-%macro PROCESS_FIRST_MMSIZE 1
+; PROCESS_FIRST_MMSIZE
+%macro PROCESS_FIRST_MMSIZE 0
   mova                  m0, [srcq]
   movu                  m4, [ref1q]
   movu                  m5, [ref2q]
   movu                  m6, [ref3q]
   movu                  m7, [ref4q]
-%if %1 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m4, m3
-  pavgb                 m5, m3
-  pavgb                 m6, m3
-  pavgb                 m7, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
   psadbw                m4, m0
   psadbw                m5, m0
   psadbw                m6, m0
   psadbw                m7, m0
 %endmacro
 
-; PROCESS_16x1x4 offset, do_avg
-%macro PROCESS_16x1x4 2
+; PROCESS_16x1x4 offset
+%macro PROCESS_16x1x4 1
   mova                  m0, [srcq + %1]
   movu                  m1, [ref1q + ref_offsetq + %1]
   movu                  m2, [ref2q + ref_offsetq + %1]
-%if %2 == 1
-  movu                  m3, [second_predq]
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-%endif
   psadbw                m1, m0
   psadbw                m2, m0
   paddd                 m4, m1
@@ -216,11 +172,6 @@
 
   movu                  m1, [ref3q + ref_offsetq + %1]
   movu                  m2, [ref4q + ref_offsetq + %1]
-%if %2 == 1
-  pavgb                 m1, m3
-  pavgb                 m2, m3
-  lea                   second_predq, [second_predq+mmsize]
-%endif
   psadbw                m1, m0
   psadbw                m2, m0
   paddd                 m6, m1
@@ -233,10 +184,8 @@
 ; Macro Arguments:
 ;   1: Width
 ;   2: Height
-;   3: If 0, then normal sad, else avg
-;   4: If 0, then normal sad, else skip rows
-;   TODO(jzern): remove the remnants of avg and merge %3 & %4
-%macro SADNXN4D 2-4 0,0
+;   3: If 0, then normal sad, else skip rows
+%macro SADNXN4D 2-3 0
 
 %define spill_src_stride 0
 %define spill_ref_stride 0
@@ -250,7 +199,7 @@
 ; Remove loops in the 4x4 and 8x4 case
 %define use_loop (use_ref_offset || %2 > 4)
 
-%if %4 == 1  ; skip rows
+%if %3 == 1  ; skip rows
 %if ARCH_X86_64
 %if use_ref_offset
 cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
@@ -278,7 +227,6 @@
 %endif
 %endif
 %else ; normal sad
-ASSERT %3 == 0
 %if ARCH_X86_64
 %if use_ref_offset
 cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
@@ -319,7 +267,7 @@
   %define cntd word [rsp]
 %endif
 
-%if %4 == 1
+%if %3 == 1
   sal          src_strided, 1
   sal          ref_strided, 1
 %endif
@@ -336,14 +284,12 @@
 %define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
 
 %if use_ref_offset
-  PROCESS_FIRST_MMSIZE %3
+  PROCESS_FIRST_MMSIZE
 %if %1 > mmsize
   mov          ref_offsetq, 0
-  mov                 cntd, %2 >> %4
+  mov                 cntd, %2 >> %3
 ; Jump part way into the loop for the square version of this width
 %if %3 == 1
-  jmp mangle(private_prefix %+ _sad%1x%1x4d_avg %+ SUFFIX).midloop
-%elif %4 == 1
   jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
 %else
   jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
@@ -351,14 +297,14 @@
 %else
   mov          ref_offsetq, ref_strideq
   add                 srcq, src_strideq
-  mov                 cntd, (%2 >> %4) - 1
+  mov                 cntd, (%2 >> %3) - 1
 %endif
 %if external_loop == 0
 .loop:
 ; Unrolled horizontal loop
 %assign h_offset 0
 %rep %1/mmsize
-  PROCESS_16x1x4 h_offset, %3
+  PROCESS_16x1x4 h_offset
 %if h_offset == 0
 ; The first row of the first column is done outside the loop and jumps here
 .midloop:
@@ -372,13 +318,13 @@
   jnz .loop
 %endif
 %else
-  PROCESS_%1x2x4 1, %3
+  PROCESS_%1x2x4 1
   ADVANCE_END_OF_TWO_LINES
 %if use_loop
-  mov                 cntd, (%2/2 >> %4) - 1
+  mov                 cntd, (%2/2 >> %3) - 1
 .loop:
 %endif
-  PROCESS_%1x2x4 0, %3
+  PROCESS_%1x2x4 0
 %if use_loop
   ADVANCE_END_OF_TWO_LINES
   sub                 cntd, 1
@@ -398,7 +344,7 @@
 %endif
 
 ; Undo modifications on parameters on the stack
-%if %4 == 1
+%if %3 == 1
 %if spill_src_stride
   shr          src_strided, 1
 %endif
@@ -417,7 +363,7 @@
   punpcklqdq            m4, m6
   punpckhqdq            m5, m7
   paddd                 m4, m5
-%if %4 == 1
+%if %3 == 1
   pslld                 m4, 1
 %endif
   movifnidn             resultq, resultmp
@@ -426,7 +372,7 @@
 %else
   pshufd            m6, m6, 0x08
   pshufd            m7, m7, 0x08
-%if %4 == 1
+%if %3 == 1
   pslld                 m6, 1
   pslld                 m7, 1
 %endif
@@ -463,29 +409,29 @@
 SADNXN4D  16,  64
 SADNXN4D  64,  16
 %endif
-SADNXN4D 128, 128, 0, 1
-SADNXN4D 128,  64, 0, 1
-SADNXN4D  64, 128, 0, 1
-SADNXN4D  64,  64, 0, 1
-SADNXN4D  64,  32, 0, 1
-SADNXN4D  32,  64, 0, 1
-SADNXN4D  32,  32, 0, 1
-SADNXN4D  32,  16, 0, 1
-SADNXN4D  16,  32, 0, 1
-SADNXN4D  16,  16, 0, 1
-SADNXN4D  16,   8, 0, 1
-SADNXN4D   8,  16, 0, 1
-SADNXN4D   8,   8, 0, 1
-SADNXN4D   4,   8, 0, 1
+SADNXN4D 128, 128, 1
+SADNXN4D 128,  64, 1
+SADNXN4D  64, 128, 1
+SADNXN4D  64,  64, 1
+SADNXN4D  64,  32, 1
+SADNXN4D  32,  64, 1
+SADNXN4D  32,  32, 1
+SADNXN4D  32,  16, 1
+SADNXN4D  16,  32, 1
+SADNXN4D  16,  16, 1
+SADNXN4D  16,   8, 1
+SADNXN4D   8,  16, 1
+SADNXN4D   8,   8, 1
+SADNXN4D   4,   8, 1
 %if CONFIG_REALTIME_ONLY==0
-SADNXN4D   4,  16, 0, 1
-SADNXN4D   8,  32, 0, 1
-SADNXN4D  32,   8, 0, 1
-SADNXN4D  16,  64, 0, 1
-SADNXN4D  64,  16, 0, 1
+SADNXN4D   4,  16, 1
+SADNXN4D   8,  32, 1
+SADNXN4D  32,   8, 1
+SADNXN4D  16,  64, 1
+SADNXN4D  64,  16, 1
 %endif
 
 ; Different assembly is needed when the height gets subsampled to 2
-; SADNXN4D 16,  4, 0, 1
-; SADNXN4D  8,  4, 0, 1
-; SADNXN4D  4,  4, 0, 1
+; SADNXN4D 16,  4, 1
+; SADNXN4D  8,  4, 1
+; SADNXN4D  4,  4, 1