Yunqing Wang | 4db2076 | 2010-10-18 14:15:15 -0400 | [diff] [blame] | 1 | ; |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 2 | ; Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yunqing Wang | 4db2076 | 2010-10-18 14:15:15 -0400 | [diff] [blame] | 3 | ; |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 4 | ; This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | ; was not distributed with this source code in the LICENSE file, you can |
| 7 | ; obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | ; Media Patent License 1.0 was not distributed with this source code in the |
| 9 | ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | ; |
| 11 | |
Yunqing Wang | 4db2076 | 2010-10-18 14:15:15 -0400 | [diff] [blame] | 12 | ; |
| 13 | |
Ronald S. Bultje | 25c588b | 2013-06-21 09:35:37 -0700 | [diff] [blame] | 14 | %include "third_party/x86inc/x86inc.asm" |
Yunqing Wang | 4db2076 | 2010-10-18 14:15:15 -0400 | [diff] [blame] | 15 | |
Ronald S. Bultje | 25c588b | 2013-06-21 09:35:37 -0700 | [diff] [blame] | 16 | SECTION .text |
Yunqing Wang | 4db2076 | 2010-10-18 14:15:15 -0400 | [diff] [blame] | 17 | |
;-----------------------------------------------------------------------------
; void aom_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; For every pixel of a rows x cols block:
;     diff[r * diff_stride + c] = (int16_t)src[...] - (int16_t)pred[...]
; The 8-bit inputs are zero-extended to 16 bits before the subtraction.
;
; diff_stride is counted in int16_t elements (it is scaled by 2 when used as
; a byte offset); src_stride and pred_stride are counted in bytes.
;
; Width dispatch: cols == 4, 8, 16, 32 (and 64 when CONFIG_EXT_PARTITION is
; set) each have a dedicated loop. Any other value falls through to the
; widest loop (128 with CONFIG_EXT_PARTITION, otherwise 64).
;
; Register use:
;   m7       all-zero, used as the high half when widening bytes to words
;   m0-m5    scratch
;   pred_str aliases colsq: only seven arguments are loaded into registers,
;            so once the width has been dispatched the cols register is
;            reused to hold pred_stride (fetched from its argument slot).
;
; Alignment: widths >= 16 use mova for src, pred and diff, and width 8 uses
; mova for diff, so those addresses must be 16-byte aligned.
; NOTE(review): the 4-, 8- and 16-wide loops handle two rows per iteration,
; so rows is presumably always even for those widths - confirm with callers.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
%if CONFIG_EXT_PARTITION
  cmp                colsd, 64
  je .case_64
%endif
  ; No match: fall through to the widest loop below.

; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
; Subtracts two independent groups of 16 pixels.
;   %1/%2  byte offsets from srcq of the two 16-byte source groups
;   %3/%4  byte offsets from predq of the two 16-byte prediction groups
;   %5/%6  byte offsets from diffq of the two 32-byte (16 x int16) outputs
; Clobbers m0-m5; requires m7 == 0.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 bytes of src group 0 -> words
  punpckhbw             m3, m1, m7     ; high 8 bytes of pred group 0 -> words
  punpcklbw             m0, m7         ; low 8 bytes of src group 0 -> words
  punpcklbw             m1, m7         ; low 8 bytes of pred group 0 -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7     ; same again for group 1
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

%if CONFIG_EXT_PARTITION
  ; 128-wide: one row per iteration, eight 16-pixel groups per row.
  mov             pred_str, pred_stridemp
.loop_128:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  sub                rowsd, 1
  jnz .loop_128
  RET

.case_64:
%endif
  ; 64-wide: one row per iteration, four 16-pixel groups per row.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:
  ; 32-wide: one row per iteration, two 16-pixel groups per row.
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:
  ; 16-wide: two rows per iteration; the second "group" passed to loop16 is
  ; the next row, addressed through the strides.
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtract two consecutive rows of 8 pixels each.
; movh loads 8 bytes per row; mova stores 8 int16 (16 bytes) per row.
; Clobbers m0-m3; requires m7 == 0.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:
  ; 8-wide: two rows per iteration.
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

.case_4:
  ; 4-wide: two rows per iteration, still in XMM registers.
  ; Only xmm7 was zeroed at function entry, so this path deliberately stays
  ; in INIT_XMM mode: switching to MMX here would make m7 refer to mm7, which
  ; is never initialised, and would leave MMX state live on return.
  ; movd loads exactly 4 source bytes and movh stores exactly 4 int16 values,
  ; so no bytes outside the 4-wide block are read or written.
  mov             pred_str, pred_stridemp
.loop_4:
  movd                  m0, [srcq]
  movd                  m2, [srcq+src_strideq]
  movd                  m1, [predq]
  movd                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  movh             [diffq], m0
  movh [diffq+diff_strideq*2], m2
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET