| /* |
| * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include <assert.h> |
| #include <stdio.h> |
| |
| #include "config/aom_dsp_rtcd.h" |
| |
| #include "aom_dsp/mips/convolve_common_dspr2.h" |
| #include "aom_dsp/aom_dsp_common.h" |
| #include "aom_dsp/aom_filter.h" |
| #include "aom_ports/mem.h" |
| |
#if HAVE_DSPR2
/*
 * Copy a w x h block of bytes from src to dst.
 *
 *   src, src_stride  source pixels and the byte distance between source rows
 *   dst, dst_stride  destination pixels and the byte distance between
 *                    destination rows
 *   w, h             block width and height in bytes / rows
 *
 * Widths 4, 8, 16, 32 and 64 are copied one row at a time with inline
 * assembly: each row is read with `ulw` (unaligned word load) and written
 * with `sw` (word store).  Any other width falls back to a plain
 * byte-by-byte C loop.
 *
 * NOTE(review): the fast paths store with plain `sw`, so dst (and dst_stride)
 * are presumably expected to be 4-byte aligned for those widths -- confirm
 * against the callers.
 *
 * Every asm statement carries a "memory" clobber.  The asm reads *src and
 * writes *dst through pointers that are passed only as register inputs, so
 * without the clobber the compiler has no way to know that memory is
 * accessed and would be free to reorder or cache surrounding loads/stores
 * (for example after inlining or with LTO).
 */
void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Issue prefetch hints for the next row before copying this one. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], (%[src]) \n\t"
            "sw %[tp1], (%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "sw %[tp1], 0(%[dst]) \n\t" /* store */
            "sw %[tp2], 4(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "ulw %[tp3], 8(%[src]) \n\t"
            "ulw %[tp4], 12(%[src]) \n\t"

            "sw %[tp1], 0(%[dst]) \n\t"  /* store */
            "sw %[tp2], 4(%[dst]) \n\t"  /* store */
            "sw %[tp3], 8(%[dst]) \n\t"  /* store */
            "sw %[tp4], 12(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "ulw %[tp3], 8(%[src]) \n\t"
            "ulw %[tp4], 12(%[src]) \n\t"
            "ulw %[tp5], 16(%[src]) \n\t"
            "ulw %[tp6], 20(%[src]) \n\t"
            "ulw %[tp7], 24(%[src]) \n\t"
            "ulw %[tp8], 28(%[src]) \n\t"

            "sw %[tp1], 0(%[dst]) \n\t"  /* store */
            "sw %[tp2], 4(%[dst]) \n\t"  /* store */
            "sw %[tp3], 8(%[dst]) \n\t"  /* store */
            "sw %[tp4], 12(%[dst]) \n\t" /* store */
            "sw %[tp5], 16(%[dst]) \n\t" /* store */
            "sw %[tp6], 20(%[dst]) \n\t" /* store */
            "sw %[tp7], 24(%[dst]) \n\t" /* store */
            "sw %[tp8], 28(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* A 64-byte row extends past the ranges prefetched above. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The row is copied as two 32-byte halves that reuse tp1..tp8. */
        __asm__ __volatile__(
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 4(%[src]) \n\t"
            "ulw %[tp3], 8(%[src]) \n\t"
            "ulw %[tp4], 12(%[src]) \n\t"
            "ulw %[tp5], 16(%[src]) \n\t"
            "ulw %[tp6], 20(%[src]) \n\t"
            "ulw %[tp7], 24(%[src]) \n\t"
            "ulw %[tp8], 28(%[src]) \n\t"

            "sw %[tp1], 0(%[dst]) \n\t"  /* store */
            "sw %[tp2], 4(%[dst]) \n\t"  /* store */
            "sw %[tp3], 8(%[dst]) \n\t"  /* store */
            "sw %[tp4], 12(%[dst]) \n\t" /* store */
            "sw %[tp5], 16(%[dst]) \n\t" /* store */
            "sw %[tp6], 20(%[dst]) \n\t" /* store */
            "sw %[tp7], 24(%[dst]) \n\t" /* store */
            "sw %[tp8], 28(%[dst]) \n\t" /* store */

            "ulw %[tp1], 32(%[src]) \n\t"
            "ulw %[tp2], 36(%[src]) \n\t"
            "ulw %[tp3], 40(%[src]) \n\t"
            "ulw %[tp4], 44(%[src]) \n\t"
            "ulw %[tp5], 48(%[src]) \n\t"
            "ulw %[tp6], 52(%[src]) \n\t"
            "ulw %[tp7], 56(%[src]) \n\t"
            "ulw %[tp8], 60(%[src]) \n\t"

            "sw %[tp1], 32(%[dst]) \n\t" /* store */
            "sw %[tp2], 36(%[dst]) \n\t" /* store */
            "sw %[tp3], 40(%[dst]) \n\t" /* store */
            "sw %[tp4], 44(%[dst]) \n\t" /* store */
            "sw %[tp5], 48(%[dst]) \n\t" /* store */
            "sw %[tp6], 52(%[dst]) \n\t" /* store */
            "sw %[tp7], 56(%[dst]) \n\t" /* store */
            "sw %[tp8], 60(%[dst]) \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
    } break;
    default:
      /* Generic width: portable byte-by-byte copy, one row per iteration. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif