Clean up aom_highbd_upsampled_pred_sse2

Removed redundant memset from aom_highbd_upsampled_pred_sse2 function.
Enabled the sse2 implementations of 'aom_highbd_convolve8_horiz' and
'aom_highbd_convolve8_vert' for 32-bit builds.

BUG=aomedia:2216

Change-Id: Iae9e68ff6a3c533d228fd43cf78c6fba64bfd3f8
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 704e358..59d0620 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -362,10 +362,10 @@
 specialize qw/aom_highbd_convolve_copy sse2 avx2/;
 
 add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
+specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
 
 add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
+specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
 
 #
 # Loopfilter
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 22cfe8b..3bf7b55 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -49,7 +49,6 @@
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
 
-#if ARCH_X86_64
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
@@ -92,5 +91,4 @@
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
 
-#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2
diff --git a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index 7b3fe64..a7152be 100644
--- a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -67,7 +67,6 @@
     dec         rcx
 %endm
 
-%if ARCH_X86_64
 %macro HIGH_GET_PARAM 0
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
@@ -86,14 +85,17 @@
 
     mov         rdx, 0x00010001
     movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm8, rdx
+    movq        xmm3, rdx
     movq        xmm5, rcx
-    pshufd      xmm8, xmm8, 0b
-    movdqa      xmm1, xmm8
-    psllw       xmm8, xmm5
-    psubw       xmm8, xmm1                  ;max value (for clamping)
+    pshufd      xmm3, xmm3, 0b
+    movdqa      xmm1, xmm3
+    psllw       xmm3, xmm5
+    psubw       xmm3, xmm1                  ;max value (for clamping)
     pxor        xmm5, xmm5                  ;min value (for clamping)
 
+    movdqa      max, xmm3
+    movdqa      min, xmm5
+
     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     movsxd      rcx, DWORD PTR arg(4)       ;output_height
@@ -113,8 +115,8 @@
     packssdw    xmm0, xmm6                  ;pack back to word
 
     ;clamp the values
-    pminsw      xmm0, xmm8
-    pmaxsw      xmm0, xmm5
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
 
 %if %1
     movdqu      xmm1, [rdi]
@@ -128,36 +130,36 @@
 %endm
 
 %macro HIGH_APPLY_FILTER_16 1
-    movdqa      xmm9, xmm0
+    movdqa      xmm5, xmm0
     movdqa      xmm6, xmm2
-    punpckhwd   xmm9, xmm1
+    punpckhwd   xmm5, xmm1
     punpckhwd   xmm6, xmm3
     punpcklwd   xmm0, xmm1
     punpcklwd   xmm2, xmm3
 
-    pmaddwd     xmm9, xmm7
+    pmaddwd     xmm5, xmm7
     pmaddwd     xmm6, xmm7
     pmaddwd     xmm0, xmm7
     pmaddwd     xmm2, xmm7
 
-    paddd       xmm9, xmm4                  ;rounding
+    paddd       xmm5, xmm4                  ;rounding
     paddd       xmm6, xmm4
     paddd       xmm0, xmm4
     paddd       xmm2, xmm4
 
-    psrad       xmm9, 7                     ;shift
+    psrad       xmm5, 7                     ;shift
     psrad       xmm6, 7
     psrad       xmm0, 7
     psrad       xmm2, 7
 
-    packssdw    xmm0, xmm9                  ;pack back to word
+    packssdw    xmm0, xmm5                  ;pack back to word
     packssdw    xmm2, xmm6                  ;pack back to word
 
     ;clamp the values
-    pminsw      xmm0, xmm8
-    pmaxsw      xmm0, xmm5
-    pminsw      xmm2, xmm8
-    pmaxsw      xmm2, xmm5
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+    pminsw      xmm2, max
+    pmaxsw      xmm2, min
 
 %if %1
     movdqu      xmm1, [rdi]
@@ -172,7 +174,6 @@
     lea         rdi, [rdi + 2*rdx]
     dec         rcx
 %endm
-%endif
 
 SECTION .text
 
@@ -200,7 +201,6 @@
     pop         rbp
     ret
 
-%if ARCH_X86_64
 global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d8_v2_sse2):
     push        rbp
@@ -211,6 +211,11 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 2
+    %define max [rsp + 16 * 0]
+    %define min [rsp + 16 * 1]
+
     HIGH_GET_PARAM
 .loop:
     movdqu      xmm0, [rsi]                 ;0
@@ -219,6 +224,9 @@
     HIGH_APPLY_FILTER_8 0
     jnz         .loop
 
+    add rsp, 16 * 2
+    pop rsp
+
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -237,6 +245,11 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 2
+    %define max [rsp + 16 * 0]
+    %define min [rsp + 16 * 1]
+
     HIGH_GET_PARAM
 .loop:
     movdqu        xmm0, [rsi]               ;0
@@ -247,6 +260,9 @@
     HIGH_APPLY_FILTER_16 0
     jnz         .loop
 
+    add rsp, 16 * 2
+    pop rsp
+
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -254,7 +270,6 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
-%endif
 
 global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_h2_sse2):
@@ -281,7 +296,6 @@
     pop         rbp
     ret
 
-%if ARCH_X86_64
 global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d8_h2_sse2):
     push        rbp
@@ -292,6 +306,11 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 2
+    %define max [rsp + 16 * 0]
+    %define min [rsp + 16 * 1]
+
     HIGH_GET_PARAM
 .loop:
     movdqu      xmm0, [rsi]                 ;load src
@@ -300,6 +319,9 @@
     HIGH_APPLY_FILTER_8 0
     jnz         .loop
 
+    add rsp, 16 * 2
+    pop rsp
+
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -318,6 +340,11 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 2
+    %define max [rsp + 16 * 0]
+    %define min [rsp + 16 * 1]
+
     HIGH_GET_PARAM
 .loop:
     movdqu      xmm0,   [rsi]               ;load src
@@ -328,6 +355,9 @@
     HIGH_APPLY_FILTER_16 0
     jnz         .loop
 
+    add rsp, 16 * 2
+    pop rsp
+
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -335,4 +365,3 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
-%endif
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index b17573e..226576b 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -734,16 +734,6 @@
     const int intermediate_height =
         (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
     assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    // TODO(Sachin): Remove the memset below when we have
-    // 4 tap simd for avx2.
-    if (subpel_search <= USE_4_TAPS) {
-      memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width * sizeof(uint16_t));
-      memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width * sizeof(uint16_t));
-      memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0,
-             width * sizeof(uint16_t));
-      memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0,
-             width * sizeof(uint16_t));
-    }
     aom_highbd_convolve8_horiz(
         ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
         MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);