blob: 3f0d2d82286045ac22c6eaaee845d9b223345ca0 [file] [log] [blame] [edit]
/*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*/
#include <assert.h>
#include <stdio.h>
#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
#if HAVE_DSPR2
/*
 * Copy a block of w x h bytes from src to dst, one row at a time.
 *
 *   src, src_stride  first source row and the byte offset between source rows
 *   dst, dst_stride  first destination row and the byte offset between
 *                    destination rows
 *   w, h             block width and height in bytes / rows
 *
 * Widths 4, 8, 16, 32 and 64 take hand-written paths that move each row as
 * 32-bit words: "ulw" (unaligned load word) reads from src and "sw" (store
 * word) writes to dst. Any other width falls back to a byte-by-byte C loop.
 *
 * NOTE(review): the fast paths store with plain "sw", so they presumably
 * rely on dst rows being 4-byte aligned -- confirm against callers.
 *
 * Every asm statement names "memory" as a clobber. The instructions read
 * through src and write through dst, but those buffers are not expressed as
 * asm operands (only the pointer values are), so without the clobber the
 * compiler would be free to assume memory is untouched and to cache or
 * reorder ordinary accesses to the buffers around the asm.
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Request the next row while the current one is being copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* A 64-byte row reaches further than the prefetches issued above. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The row is moved as two 32-byte halves that reuse the same eight
         * temporaries: bytes 0..31 first, then bytes 32..63. */
        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* Generic width: plain byte copy, no assembly involved. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif