Blame - aom_dsp/mips/convolve8_dspr2.c - avm

blob: 80c5df757f7f5ef02d54faaf38b6f415b4561925 [file] [log] [blame]

Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	1	/*
Yaowu Xu	9c01aa1	2016-09-01 14:32:49 -0700	[diff] [blame]	2	* Copyright (c) 2016, Alliance for Open Media. All rights reserved
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	3	*
Yaowu Xu	9c01aa1	2016-09-01 14:32:49 -0700	[diff] [blame]	4	* This source code is subject to the terms of the BSD 2 Clause License and
				5	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
				6	* was not distributed with this source code in the LICENSE file, you can
				7	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
				8	* Media Patent License 1.0 was not distributed with this source code in the
				9	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	10	*/
				11
				12	#include <assert.h>
				13	#include <stdio.h>
				14
Yaowu Xu	f883b42	2016-08-30 14:01:10 -0700	[diff] [blame]	15	#include "./aom_dsp_rtcd.h"
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	16	#include "aom_dsp/mips/convolve_common_dspr2.h"
Yaowu Xu	f883b42	2016-08-30 14:01:10 -0700	[diff] [blame]	17	#include "aom_dsp/aom_dsp_common.h"
				18	#include "aom_dsp/aom_filter.h"
Yaowu Xu	c27fc14	2016-08-22 16:08:15 -0700	[diff] [blame]	19	#include "aom_ports/mem.h"
				20
				21	#if HAVE_DSPR2
/*
 * Copies a w x h block of bytes from src to dst, row by row.
 *
 * The filter arguments are accepted only so that the signature matches the
 * other convolve functions; they are ignored.
 *
 * Widths 4, 8, 16, 32 and 64 use word-sized inline-assembly copies; any other
 * width falls back to a plain byte loop.
 *
 * Loads use `ulw`, so src may have any alignment. Stores use plain `sw`,
 * so for the specialised widths every dst row is expected to be 4-byte
 * aligned (NOTE(review): confirm that all callers guarantee this).
 *
 * Every asm statement carries a "memory" clobber: the asm reads *src and
 * writes *dst through pointers that are passed only as register inputs, so
 * without the clobber the compiler is not told that memory is accessed and
 * may cache or reorder surrounding loads/stores across the statement.
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is being copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is being copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is being copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is being copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* A 64-byte row extends past the offsets prefetched above. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is being copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The row is copied as two 32-byte halves that reuse tp1..tp8. */
        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* Generic width: plain byte-by-byte copy, no alignment requirement. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
				221	#endif