Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | /* |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | * |
Yaowu Xu | 9c01aa1 | 2016-09-01 14:32:49 -0700 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
| 12 | #include <assert.h> |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 13 | #include "./aom_dsp_rtcd.h" |
| 14 | #include "aom_dsp/mips/aom_convolve_msa.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 15 | |
/* Byte-index tables that the convolve kernels load as 16-byte vectors.
 * Every entry pair is (k, k + 1), i.e. two neighbouring byte positions.
 * In this file the 8-wide kernels load the table at offset 0 and the 4-wide
 * kernels load it at offset 16; offset 32 is not referenced here. */
const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* offset 0: 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4,
  4, 5, 5, 6, 6, 7, 7, 8,
  /* offset 16: 4 width cases (second half indexes 16..20) */
  0, 1, 1, 2, 2, 3, 3, 4,
  16, 17, 17, 18, 18, 19, 19, 20,
  /* offset 32: 4 width cases, shifted by 8 positions */
  8, 9, 9, 10, 10, 11, 11, 12,
  24, 25, 25, 26, 26, 27, 27, 28
};
| 24 | |
| 25 | static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, |
| 26 | uint8_t *dst, int32_t dst_stride, |
| 27 | int8_t *filter_horiz, int8_t *filter_vert, |
| 28 | int32_t height) { |
| 29 | uint32_t loop_cnt; |
| 30 | v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; |
| 31 | v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; |
| 32 | v16u8 mask0, mask1, mask2, mask3, out; |
| 33 | v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; |
| 34 | v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; |
| 35 | v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; |
| 36 | |
| 37 | mask0 = LD_UB(&mc_filt_mask_arr[16]); |
| 38 | src -= (3 + 3 * src_stride); |
| 39 | |
| 40 | /* rearranging filter */ |
| 41 | filt = LD_SH(filter_horiz); |
| 42 | SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); |
| 43 | |
| 44 | mask1 = mask0 + 2; |
| 45 | mask2 = mask0 + 4; |
| 46 | mask3 = mask0 + 6; |
| 47 | |
| 48 | LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); |
| 49 | XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); |
| 50 | src += (7 * src_stride); |
| 51 | |
| 52 | hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, |
| 53 | filt_hz1, filt_hz2, filt_hz3); |
| 54 | hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, |
| 55 | filt_hz1, filt_hz2, filt_hz3); |
| 56 | hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, |
| 57 | filt_hz1, filt_hz2, filt_hz3); |
| 58 | hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, |
| 59 | filt_hz1, filt_hz2, filt_hz3); |
| 60 | SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); |
| 61 | |
| 62 | filt = LD_SH(filter_vert); |
| 63 | SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); |
| 64 | |
| 65 | ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); |
| 66 | out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); |
| 67 | |
| 68 | for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 69 | LD_SB4(src, src_stride, src7, src8, src9, src10); |
| 70 | XORI_B4_128_SB(src7, src8, src9, src10); |
| 71 | src += (4 * src_stride); |
| 72 | |
| 73 | hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, |
| 74 | filt_hz1, filt_hz2, filt_hz3); |
| 75 | hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); |
| 76 | out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); |
| 77 | tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, |
| 78 | filt_vt2, filt_vt3); |
| 79 | |
| 80 | hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, |
| 81 | filt_hz1, filt_hz2, filt_hz3); |
| 82 | hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); |
| 83 | out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); |
| 84 | tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, |
| 85 | filt_vt2, filt_vt3); |
| 86 | SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); |
| 87 | SAT_SH2_SH(tmp0, tmp1, 7); |
| 88 | out = PCKEV_XORI128_UB(tmp0, tmp1); |
| 89 | ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); |
| 90 | dst += (4 * dst_stride); |
| 91 | |
| 92 | hz_out5 = hz_out9; |
| 93 | out0 = out2; |
| 94 | out1 = out3; |
| 95 | out2 = out4; |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, |
| 100 | uint8_t *dst, int32_t dst_stride, |
| 101 | int8_t *filter_horiz, int8_t *filter_vert, |
| 102 | int32_t height) { |
| 103 | uint32_t loop_cnt; |
| 104 | v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; |
| 105 | v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; |
| 106 | v16u8 mask0, mask1, mask2, mask3, vec0, vec1; |
| 107 | v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; |
| 108 | v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; |
| 109 | v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; |
| 110 | v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; |
| 111 | |
| 112 | mask0 = LD_UB(&mc_filt_mask_arr[0]); |
| 113 | src -= (3 + 3 * src_stride); |
| 114 | |
| 115 | /* rearranging filter */ |
| 116 | filt = LD_SH(filter_horiz); |
| 117 | SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); |
| 118 | |
| 119 | mask1 = mask0 + 2; |
| 120 | mask2 = mask0 + 4; |
| 121 | mask3 = mask0 + 6; |
| 122 | |
| 123 | LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); |
| 124 | src += (7 * src_stride); |
| 125 | |
| 126 | XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); |
| 127 | hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, |
| 128 | filt_hz1, filt_hz2, filt_hz3); |
| 129 | hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, |
| 130 | filt_hz1, filt_hz2, filt_hz3); |
| 131 | hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, |
| 132 | filt_hz1, filt_hz2, filt_hz3); |
| 133 | hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, |
| 134 | filt_hz1, filt_hz2, filt_hz3); |
| 135 | hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, |
| 136 | filt_hz1, filt_hz2, filt_hz3); |
| 137 | hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, |
| 138 | filt_hz1, filt_hz2, filt_hz3); |
| 139 | hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, |
| 140 | filt_hz1, filt_hz2, filt_hz3); |
| 141 | |
| 142 | filt = LD_SH(filter_vert); |
| 143 | SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); |
| 144 | |
| 145 | ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); |
| 146 | ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); |
| 147 | ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); |
| 148 | |
| 149 | for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 150 | LD_SB4(src, src_stride, src7, src8, src9, src10); |
| 151 | src += (4 * src_stride); |
| 152 | |
| 153 | XORI_B4_128_SB(src7, src8, src9, src10); |
| 154 | |
| 155 | hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, |
| 156 | filt_hz1, filt_hz2, filt_hz3); |
| 157 | out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); |
| 158 | tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, |
| 159 | filt_vt2, filt_vt3); |
| 160 | |
| 161 | hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, |
| 162 | filt_hz1, filt_hz2, filt_hz3); |
| 163 | out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); |
| 164 | tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, |
| 165 | filt_vt2, filt_vt3); |
| 166 | |
| 167 | hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, |
| 168 | filt_hz1, filt_hz2, filt_hz3); |
| 169 | out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); |
| 170 | tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, |
| 171 | filt_vt2, filt_vt3); |
| 172 | |
| 173 | hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, |
| 174 | filt_hz0, filt_hz1, filt_hz2, filt_hz3); |
| 175 | out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); |
| 176 | tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, |
| 177 | filt_vt2, filt_vt3); |
| 178 | SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 179 | SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); |
| 180 | vec0 = PCKEV_XORI128_UB(tmp0, tmp1); |
| 181 | vec1 = PCKEV_XORI128_UB(tmp2, tmp3); |
| 182 | ST8x4_UB(vec0, vec1, dst, dst_stride); |
| 183 | dst += (4 * dst_stride); |
| 184 | |
| 185 | hz_out6 = hz_out10; |
| 186 | out0 = out2; |
| 187 | out1 = out3; |
| 188 | out2 = out8; |
| 189 | out4 = out6; |
| 190 | out5 = out7; |
| 191 | out6 = out9; |
| 192 | } |
| 193 | } |
| 194 | |
/* 16-wide 8-tap HV filter: runs the 8-wide kernel on two adjacent 8-pixel
 * columns, left to right. */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
| 207 | |
/* 32-wide 8-tap HV filter: runs the 8-wide kernel on four adjacent 8-pixel
 * columns, left to right. */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
| 220 | |
/* 64-wide 8-tap HV filter: runs the 8-wide kernel on eight adjacent 8-pixel
 * columns, left to right. */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
| 233 | |
| 234 | static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, |
| 235 | uint8_t *dst, int32_t dst_stride, |
| 236 | int8_t *filter_horiz, |
| 237 | int8_t *filter_vert) { |
| 238 | v16i8 src0, src1, src2, src3, src4, mask; |
| 239 | v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; |
| 240 | v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; |
| 241 | |
| 242 | mask = LD_SB(&mc_filt_mask_arr[16]); |
| 243 | |
| 244 | /* rearranging filter */ |
| 245 | filt = LD_UH(filter_horiz); |
| 246 | filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 247 | |
| 248 | filt = LD_UH(filter_vert); |
| 249 | filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 250 | |
| 251 | LD_SB5(src, src_stride, src0, src1, src2, src3, src4); |
| 252 | hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
| 253 | hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
| 254 | hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 255 | hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
| 256 | hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
| 257 | |
| 258 | ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 259 | DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 260 | SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 261 | PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); |
| 262 | ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 263 | } |
| 264 | |
| 265 | static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, |
| 266 | uint8_t *dst, int32_t dst_stride, |
| 267 | int8_t *filter_horiz, |
| 268 | int8_t *filter_vert) { |
| 269 | v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; |
| 270 | v16i8 res0, res1, res2, res3; |
| 271 | v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; |
| 272 | v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; |
| 273 | v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; |
| 274 | |
| 275 | mask = LD_SB(&mc_filt_mask_arr[16]); |
| 276 | |
| 277 | /* rearranging filter */ |
| 278 | filt = LD_UH(filter_horiz); |
| 279 | filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 280 | |
| 281 | filt = LD_UH(filter_vert); |
| 282 | filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 283 | |
| 284 | LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); |
| 285 | src += (8 * src_stride); |
| 286 | src8 = LD_SB(src); |
| 287 | |
| 288 | hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
| 289 | hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
| 290 | hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); |
| 291 | hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); |
| 292 | hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); |
| 293 | SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, |
| 294 | hz_out3, hz_out5, 8); |
| 295 | hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); |
| 296 | |
| 297 | ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 298 | ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); |
| 299 | DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, |
| 300 | vec5, vec6, vec7); |
| 301 | SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
| 302 | PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, |
| 303 | res3); |
| 304 | ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 305 | dst += (4 * dst_stride); |
| 306 | ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
| 307 | } |
| 308 | |
/* 4-wide 2-tap HV filter: dispatches on block height.
 * Only heights 4 and 8 are handled; any other height writes nothing. */
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                                filter_vert);
      break;
    default:
      /* Unsupported height: intentionally left untouched. */
      break;
  }
}
| 321 | |
| 322 | static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, |
| 323 | uint8_t *dst, int32_t dst_stride, |
| 324 | int8_t *filter_horiz, |
| 325 | int8_t *filter_vert) { |
| 326 | v16i8 src0, src1, src2, src3, src4, mask, out0, out1; |
| 327 | v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; |
| 328 | v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; |
| 329 | v8i16 filt; |
| 330 | |
| 331 | mask = LD_SB(&mc_filt_mask_arr[0]); |
| 332 | |
| 333 | /* rearranging filter */ |
| 334 | filt = LD_SH(filter_horiz); |
| 335 | filt_hz = (v16u8)__msa_splati_h(filt, 0); |
| 336 | |
| 337 | filt = LD_SH(filter_vert); |
| 338 | filt_vt = (v16u8)__msa_splati_h(filt, 0); |
| 339 | |
| 340 | LD_SB5(src, src_stride, src0, src1, src2, src3, src4); |
| 341 | |
| 342 | hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 343 | hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 344 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 345 | tmp0 = __msa_dotp_u_h(vec0, filt_vt); |
| 346 | |
| 347 | hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 348 | vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 349 | tmp1 = __msa_dotp_u_h(vec1, filt_vt); |
| 350 | |
| 351 | hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 352 | vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 353 | tmp2 = __msa_dotp_u_h(vec2, filt_vt); |
| 354 | |
| 355 | hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 356 | vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 357 | tmp3 = __msa_dotp_u_h(vec3, filt_vt); |
| 358 | |
| 359 | SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 360 | PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 361 | ST8x4_UB(out0, out1, dst, dst_stride); |
| 362 | } |
| 363 | |
| 364 | static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, |
| 365 | int32_t src_stride, uint8_t *dst, |
| 366 | int32_t dst_stride, |
| 367 | int8_t *filter_horiz, |
| 368 | int8_t *filter_vert, int32_t height) { |
| 369 | uint32_t loop_cnt; |
| 370 | v16i8 src0, src1, src2, src3, src4, mask, out0, out1; |
| 371 | v16u8 filt_hz, filt_vt, vec0; |
| 372 | v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; |
| 373 | v8i16 filt; |
| 374 | |
| 375 | mask = LD_SB(&mc_filt_mask_arr[0]); |
| 376 | |
| 377 | /* rearranging filter */ |
| 378 | filt = LD_SH(filter_horiz); |
| 379 | filt_hz = (v16u8)__msa_splati_h(filt, 0); |
| 380 | |
| 381 | filt = LD_SH(filter_vert); |
| 382 | filt_vt = (v16u8)__msa_splati_h(filt, 0); |
| 383 | |
| 384 | src0 = LD_SB(src); |
| 385 | src += src_stride; |
| 386 | |
| 387 | hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 388 | |
| 389 | for (loop_cnt = (height >> 3); loop_cnt--;) { |
| 390 | LD_SB4(src, src_stride, src1, src2, src3, src4); |
| 391 | src += (4 * src_stride); |
| 392 | |
| 393 | hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 394 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 395 | tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
| 396 | |
| 397 | hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 398 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 399 | tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
| 400 | |
| 401 | SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 402 | |
| 403 | hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 404 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 405 | tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
| 406 | |
| 407 | hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 408 | LD_SB4(src, src_stride, src1, src2, src3, src4); |
| 409 | src += (4 * src_stride); |
| 410 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 411 | tmp4 = __msa_dotp_u_h(vec0, filt_vt); |
| 412 | |
| 413 | SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); |
| 414 | PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); |
| 415 | ST8x4_UB(out0, out1, dst, dst_stride); |
| 416 | dst += (4 * dst_stride); |
| 417 | |
| 418 | hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 419 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 420 | tmp5 = __msa_dotp_u_h(vec0, filt_vt); |
| 421 | |
| 422 | hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 423 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 424 | tmp6 = __msa_dotp_u_h(vec0, filt_vt); |
| 425 | |
| 426 | hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 427 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 428 | tmp7 = __msa_dotp_u_h(vec0, filt_vt); |
| 429 | |
| 430 | hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 431 | vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 432 | tmp8 = __msa_dotp_u_h(vec0, filt_vt); |
| 433 | |
| 434 | SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); |
| 435 | PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); |
| 436 | ST8x4_UB(out0, out1, dst, dst_stride); |
| 437 | dst += (4 * dst_stride); |
| 438 | } |
| 439 | } |
| 440 | |
/* 8-wide 2-tap HV filter: height 4 uses the dedicated 8x4 kernel, every
 * other height goes to the kernel that works in chunks of eight rows. */
static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert);
}
| 453 | |
| 454 | static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, |
| 455 | uint8_t *dst, int32_t dst_stride, |
| 456 | int8_t *filter_horiz, int8_t *filter_vert, |
| 457 | int32_t height) { |
| 458 | uint32_t loop_cnt; |
| 459 | v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 460 | v16u8 filt_hz, filt_vt, vec0, vec1; |
| 461 | v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; |
| 462 | v8i16 filt; |
| 463 | |
| 464 | mask = LD_SB(&mc_filt_mask_arr[0]); |
| 465 | |
| 466 | /* rearranging filter */ |
| 467 | filt = LD_SH(filter_horiz); |
| 468 | filt_hz = (v16u8)__msa_splati_h(filt, 0); |
| 469 | |
| 470 | filt = LD_SH(filter_vert); |
| 471 | filt_vt = (v16u8)__msa_splati_h(filt, 0); |
| 472 | |
| 473 | LD_SB2(src, 8, src0, src1); |
| 474 | src += src_stride; |
| 475 | |
| 476 | hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 477 | hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 478 | |
| 479 | for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 480 | LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 481 | LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 482 | src += (4 * src_stride); |
| 483 | |
| 484 | hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 485 | hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 486 | ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 487 | DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 488 | SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 489 | PCKEV_ST_SB(tmp1, tmp2, dst); |
| 490 | dst += dst_stride; |
| 491 | |
| 492 | hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 493 | hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 494 | ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 495 | DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 496 | SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 497 | PCKEV_ST_SB(tmp1, tmp2, dst); |
| 498 | dst += dst_stride; |
| 499 | |
| 500 | hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 501 | hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
| 502 | ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 503 | DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 504 | SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 505 | PCKEV_ST_SB(tmp1, tmp2, dst); |
| 506 | dst += dst_stride; |
| 507 | |
| 508 | hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
| 509 | hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
| 510 | ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 511 | DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 512 | SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 513 | PCKEV_ST_SB(tmp1, tmp2, dst); |
| 514 | dst += dst_stride; |
| 515 | } |
| 516 | } |
| 517 | |
/* 32-wide 2-tap HV filter: runs the 16-wide kernel on two adjacent
 * 16-pixel columns, left to right. */
static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}
| 530 | |
/* 64-wide 2-tap HV filter: runs the 16-wide kernel on four adjacent
 * 16-pixel columns, left to right. */
static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_16w_msa(src + col, src_stride, dst + col, dst_stride,
                              filter_horiz, filter_vert, height);
  }
}
| 543 | |
/* 2-D (horizontal then vertical) sub-pixel convolution, MSA entry point.
 *
 * src / src_stride:  source pixels and row pitch.
 * dst / dst_stride:  destination pixels and row pitch.
 * filter_x/filter_y: 8-tap kernels (int16_t each); they are narrowed to
 *                    int8_t before being handed to the MSA kernels.
 * x_step_q4/y_step_q4: must both be 16 (asserted); scaling is not supported.
 * w, h:              block size.  Widths 4, 8, 16, 32 and 64 have MSA kernels;
 *                    any other width uses aom_convolve8_c().
 *
 * Dispatch:
 *  - taps 0 and 1 are zero in BOTH kernels -> 2-tap MSA kernels, which are
 *    given the coefficients starting at tap 3;
 *  - taps 0 and 1 are zero in only ONE kernel -> aom_convolve8_c();
 *  - otherwise -> 8-tap MSA kernels.
 */
void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int32_t x_step_q4, const int16_t *filter_y,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  /* The kernels fetch coefficients with LD_SH/LD_UH straight into
   * v8i16/v8u16 vectors, and the 2-tap paths start at element 3.
   * NOTE(review): assuming those are full 16-byte vector loads (confirm in
   * macros_msa.h), an 8-byte array would be over-read; the arrays are
   * therefore zero-padded to offset 3 + one whole vector.  Only the first
   * eight entries carry coefficients. */
  int8_t filt_hor[3 + 16] = { 0 };
  int8_t filt_ver[3 + 16] = { 0 };
  int cnt;

  /* Taps are compared element by element instead of through an int32_t
   * alias of the int16_t data: that avoids the strict-aliasing violation
   * and keeps the checks independent of byte order. */
  const int x_is_2tap = (filter_x[0] == 0 && filter_x[1] == 0);
  const int y_is_2tap = (filter_y[0] == 0 && filter_y[1] == 0);

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* Same condition the little-endian 32-bit compare against 0x800000
   * expressed: tap 2 == 0 together with tap 3 == 128 must not occur. */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  if (x_is_2tap && y_is_2tap) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
        break;
    }
  } else if (x_is_2tap || y_is_2tap) {
    /* Mixed 2-tap / 8-tap combination has no MSA kernel. */
    aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                    filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
        break;
    }
  }
}