Clean up aom_highbd_upsampled_pred_sse2
Removed the redundant memset calls from the aom_highbd_upsampled_pred_sse2 function.
Enabled the sse2 implementations of 'aom_highbd_convolve8_horiz' and
'aom_highbd_convolve8_vert' for 32-bit builds.
BUG=aomedia:2216
Change-Id: Iae9e68ff6a3c533d228fd43cf78c6fba64bfd3f8
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 704e358..59d0620 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -362,10 +362,10 @@
specialize qw/aom_highbd_convolve_copy sse2 avx2/;
add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
+specialize qw/aom_highbd_convolve8_horiz sse2 avx2/;
add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
+specialize qw/aom_highbd_convolve8_vert sse2 avx2/;
#
# Loopfilter
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 22cfe8b..3bf7b55 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -49,7 +49,6 @@
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-#if ARCH_X86_64
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
@@ -92,5 +91,4 @@
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-#endif // ARCH_X86_64
#endif // HAVE_SSE2
diff --git a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index 7b3fe64..a7152be 100644
--- a/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -67,7 +67,6 @@
dec rcx
%endm
-%if ARCH_X86_64
%macro HIGH_GET_PARAM 0
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
@@ -86,14 +85,17 @@
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm8, rdx
+ movq xmm3, rdx
movq xmm5, rcx
- pshufd xmm8, xmm8, 0b
- movdqa xmm1, xmm8
- psllw xmm8, xmm5
- psubw xmm8, xmm1 ;max value (for clamping)
+ pshufd xmm3, xmm3, 0b
+ movdqa xmm1, xmm3
+ psllw xmm3, xmm5
+ psubw xmm3, xmm1 ;max value (for clamping)
pxor xmm5, xmm5 ;min value (for clamping)
+ movdqa max, xmm3
+ movdqa min, xmm5
+
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
movsxd rdx, DWORD PTR arg(3) ;out_pitch
movsxd rcx, DWORD PTR arg(4) ;output_height
@@ -113,8 +115,8 @@
packssdw xmm0, xmm6 ;pack back to word
;clamp the values
- pminsw xmm0, xmm8
- pmaxsw xmm0, xmm5
+ pminsw xmm0, max
+ pmaxsw xmm0, min
%if %1
movdqu xmm1, [rdi]
@@ -128,36 +130,36 @@
%endm
%macro HIGH_APPLY_FILTER_16 1
- movdqa xmm9, xmm0
+ movdqa xmm5, xmm0
movdqa xmm6, xmm2
- punpckhwd xmm9, xmm1
+ punpckhwd xmm5, xmm1
punpckhwd xmm6, xmm3
punpcklwd xmm0, xmm1
punpcklwd xmm2, xmm3
- pmaddwd xmm9, xmm7
+ pmaddwd xmm5, xmm7
pmaddwd xmm6, xmm7
pmaddwd xmm0, xmm7
pmaddwd xmm2, xmm7
- paddd xmm9, xmm4 ;rounding
+ paddd xmm5, xmm4 ;rounding
paddd xmm6, xmm4
paddd xmm0, xmm4
paddd xmm2, xmm4
- psrad xmm9, 7 ;shift
+ psrad xmm5, 7 ;shift
psrad xmm6, 7
psrad xmm0, 7
psrad xmm2, 7
- packssdw xmm0, xmm9 ;pack back to word
+ packssdw xmm0, xmm5 ;pack back to word
packssdw xmm2, xmm6 ;pack back to word
;clamp the values
- pminsw xmm0, xmm8
- pmaxsw xmm0, xmm5
- pminsw xmm2, xmm8
- pmaxsw xmm2, xmm5
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+ pminsw xmm2, max
+ pmaxsw xmm2, min
%if %1
movdqu xmm1, [rdi]
@@ -172,7 +174,6 @@
lea rdi, [rdi + 2*rdx]
dec rcx
%endm
-%endif
SECTION .text
@@ -200,7 +201,6 @@
pop rbp
ret
-%if ARCH_X86_64
global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_v2_sse2):
push rbp
@@ -211,6 +211,11 @@
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;0
@@ -219,6 +224,9 @@
HIGH_APPLY_FILTER_8 0
jnz .loop
+ add rsp, 16 * 2
+ pop rsp
+
; begin epilog
pop rdi
pop rsi
@@ -237,6 +245,11 @@
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;0
@@ -247,6 +260,9 @@
HIGH_APPLY_FILTER_16 0
jnz .loop
+ add rsp, 16 * 2
+ pop rsp
+
; begin epilog
pop rdi
pop rsi
@@ -254,7 +270,6 @@
UNSHADOW_ARGS
pop rbp
ret
-%endif
global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d4_h2_sse2):
@@ -281,7 +296,6 @@
pop rbp
ret
-%if ARCH_X86_64
global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
sym(aom_highbd_filter_block1d8_h2_sse2):
push rbp
@@ -292,6 +306,11 @@
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;load src
@@ -300,6 +319,9 @@
HIGH_APPLY_FILTER_8 0
jnz .loop
+ add rsp, 16 * 2
+ pop rsp
+
; begin epilog
pop rdi
pop rsi
@@ -318,6 +340,11 @@
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
HIGH_GET_PARAM
.loop:
movdqu xmm0, [rsi] ;load src
@@ -328,6 +355,9 @@
HIGH_APPLY_FILTER_16 0
jnz .loop
+ add rsp, 16 * 2
+ pop rsp
+
; begin epilog
pop rdi
pop rsi
@@ -335,4 +365,3 @@
UNSHADOW_ARGS
pop rbp
ret
-%endif
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index b17573e..226576b 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -734,16 +734,6 @@
const int intermediate_height =
(((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- // TODO(Sachin): Remove the memset below when we have
- // 4 tap simd for avx2.
- if (subpel_search <= USE_4_TAPS) {
- memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width * sizeof(uint16_t));
- memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width * sizeof(uint16_t));
- memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0,
- width * sizeof(uint16_t));
- memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0,
- width * sizeof(uint16_t));
- }
aom_highbd_convolve8_horiz(
ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);