blob: 7bd5b23ad2c104a3fa13490f8abde780f558d3f9 [file] [log] [blame]
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
11
Yunqing Wang4db20762010-10-18 14:15:15 -040012;
13
Ronald S. Bultje25c588b2013-06-21 09:35:37 -070014%include "third_party/x86inc/x86inc.asm"
Yunqing Wang4db20762010-10-18 14:15:15 -040015
Ronald S. Bultje25c588b2013-06-21 09:35:37 -070016SECTION .text
Yunqing Wang4db20762010-10-18 14:15:15 -040017
;-----------------------------------------------------------------------------
; void aom_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; For every pixel of a rows x cols block, widens the src and pred bytes to
; 16-bit words and stores (src - pred) into diff.
;
; Strides:
;   src_stride / pred_stride are added to the byte pointers as-is.
;   diff_stride is scaled by 2 before being added to diffq, i.e. it is
;   counted in int16_t elements.
;
; Width dispatch:
;   cols == 4, 8, 16, 32 (and 64 when CONFIG_EXT_PARTITION is set) jump to a
;   dedicated loop.  Any other value falls through to the widest loop
;   (128 columns with CONFIG_EXT_PARTITION, 64 without); cols is not
;   validated here.
;
; Register use:
;   m7       - all-zero, used as the high-byte source when unpacking.
;   m0-m5    - scratch.
;   pred_str - alias of colsq.  Only 7 arguments are loaded into registers,
;              so once cols has been dispatched on its register is reused
;              to hold pred_stride (read from pred_stridemp).
;
; Assumptions visible from the instructions used (callers must honour them):
;   - the 16..128 wide paths load src/pred and store diff with mova, and the
;     8 wide path stores diff with mova, so those addresses must be aligned
;     to mmsize (16 bytes).
;   - the 4, 8 and 16 wide paths consume two rows per iteration, so rows is
;     presumably always even on those paths - NOTE(review): confirm callers.
;   - every loop runs its body at least once before testing rows.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
%if CONFIG_EXT_PARTITION
  cmp                colsd, 64
  je .case_64
%endif

; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
;
; Subtracts two independent groups of mmsize pixels.
;   %1, %2 - offsets added to srcq for the two groups
;   %3, %4 - offsets added to predq for the two groups
;   %5, %6 - byte offsets added to diffq for the two groups; each group
;            writes 2*mmsize bytes (low half first, then high half)
; Clobbers m0-m5; expects m7 == 0.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 src bytes of group 0 -> words
  punpckhbw             m3, m1, m7     ; high 8 pred bytes of group 0 -> words
  punpcklbw             m0, m7         ; low 8 src bytes of group 0 -> words
  punpcklbw             m1, m7         ; low 8 pred bytes of group 0 -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7     ; same again for group 1
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

%if CONFIG_EXT_PARTITION
  ; Fall-through path: 128 columns, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_128:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd             ; same dec/jg exit test as .loop_64/.loop_32
  jg .loop_128
  RET

.case_64:
%endif
  ; 64 columns, one row per iteration.  Without CONFIG_EXT_PARTITION this is
  ; the fall-through path for any cols value not matched above.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:
  ; 32 columns, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:
  ; 16 columns, two rows per iteration: the second loop16 group is the next
  ; row, reached through the row strides instead of a constant offset.
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h
;
; Subtracts mmsize/2 pixels on each of two consecutive rows (8 pixels per row
; under INIT_XMM, 4 pixels per row under INIT_MMX).
; Clobbers m0-m3; expects m7 == 0.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:
  ; 8 columns, two rows per iteration.
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

INIT_MMX
.case_4:
  ; 4 columns, two rows per iteration, using the 64-bit MMX registers.
  mov             pred_str, pred_stridemp
  ; From here on m7 names mm7, which the pxor at function entry (issued for
  ; xmm7) did not clear.  Zero it so loop_h does not rely on stale high bytes
  ; cancelling out in psubw.
  pxor                  m7, m7
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  emms                                 ; leave MMX state so the caller can use x87
  RET