blob: 9e57c7176f33b6c4076fb0dc8a4511c69cbc077b [file] [log] [blame]
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
#include <arm_neon.h>
#include <string.h>

#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
Yaowu Xuc27fc142016-08-22 16:08:15 -070016
// Copies an h-row block of 8-bit samples from src to dst using NEON loads and
// stores. No filtering is performed: filter_x, filter_x_stride, filter_y and
// filter_y_stride are accepted only to match the convolve function signature
// and are ignored.
//
// The number of bytes copied per row is selected from w:
//   w > 32      -> 64 bytes
//   w == 32     -> 32 bytes
//   8 < w < 32  -> 16 bytes
//   w == 8      ->  8 bytes
//   w < 8       ->  4 bytes
//
// src_stride / dst_stride are the byte distances between consecutive rows of
// the source and destination. Exactly h rows are copied; nothing is copied
// when h <= 0.
void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride, int w,
                            int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  if (w > 32) {  // copy64
    for (; h > 0; --h) {
      const uint8x16_t s0 = vld1q_u8(src);
      const uint8x16_t s1 = vld1q_u8(src + 16);
      const uint8x16_t s2 = vld1q_u8(src + 32);
      const uint8x16_t s3 = vld1q_u8(src + 48);
      src += src_stride;

      vst1q_u8(dst, s0);
      vst1q_u8(dst + 16, s1);
      vst1q_u8(dst + 32, s2);
      vst1q_u8(dst + 48, s3);
      dst += dst_stride;
    }
  } else if (w == 32) {  // copy32
    // Two rows per iteration; both rows are loaded before either is stored.
    for (; h > 1; h -= 2) {
      const uint8x16_t a0 = vld1q_u8(src);
      const uint8x16_t a1 = vld1q_u8(src + 16);
      src += src_stride;
      const uint8x16_t b0 = vld1q_u8(src);
      const uint8x16_t b1 = vld1q_u8(src + 16);
      src += src_stride;

      vst1q_u8(dst, a0);
      vst1q_u8(dst + 16, a1);
      dst += dst_stride;
      vst1q_u8(dst, b0);
      vst1q_u8(dst + 16, b1);
      dst += dst_stride;
    }
    // Odd row count: copy the final row alone so that no row beyond the
    // requested block is read or written.
    if (h > 0) {
      const uint8x16_t a0 = vld1q_u8(src);
      const uint8x16_t a1 = vld1q_u8(src + 16);
      vst1q_u8(dst, a0);
      vst1q_u8(dst + 16, a1);
    }
  } else if (w > 8) {  // copy16
    for (; h > 1; h -= 2) {
      const uint8x16_t a = vld1q_u8(src);
      src += src_stride;
      const uint8x16_t b = vld1q_u8(src);
      src += src_stride;

      vst1q_u8(dst, a);
      dst += dst_stride;
      vst1q_u8(dst, b);
      dst += dst_stride;
    }
    if (h > 0) {  // trailing row when h is odd
      vst1q_u8(dst, vld1q_u8(src));
    }
  } else if (w == 8) {  // copy8
    for (; h > 1; h -= 2) {
      const uint8x8_t a = vld1_u8(src);
      src += src_stride;
      const uint8x8_t b = vld1_u8(src);
      src += src_stride;

      vst1_u8(dst, a);
      dst += dst_stride;
      vst1_u8(dst, b);
      dst += dst_stride;
    }
    if (h > 0) {  // trailing row when h is odd
      vst1_u8(dst, vld1_u8(src));
    }
  } else {  // copy4
    for (; h > 0; --h) {
      // memcpy rather than a uint32_t pointer cast: src/dst are byte pointers
      // with no alignment guarantee, and accessing them through a uint32_t
      // lvalue would violate the aliasing rules.
      memcpy(dst, src, 4);
      src += src_stride;
      dst += dst_stride;
    }
  }
}