separate compound/singleref convolve functions
First part of a clean-up to remove conv_params->dst.
This change is similar to
https://aomedia-review.googlesource.com/c/aom/+/107922
but targets the master branch.
1. Reference the convolve functions directly (instead of through the
indirect sf->convolve map) and clean up unused parameters.
2. Remove av1_convolve_2d_copy - redundant with
aom_convolve_copy. Other redundant functions will be removed
in future changes.
3. Enable MIPS optimizations for aom_convolve_copy -- they
are already present, but were not enabled.
4. Re-work the convolve testing framework. The time
to run the tests has been reduced by ~8%; the code is less
compact but I believe it is easier to read.
BUG=aomedia:2634
Change-Id: I515526437e9dc40db3b54a264c8cde416e3a1e4c
diff --git a/aom_dsp/mips/aom_convolve_copy_dspr2.c b/aom_dsp/mips/aom_convolve_copy_dspr2.c
new file mode 100644
index 0000000..12a213e
--- /dev/null
+++ b/aom_dsp/mips/aom_convolve_copy_dspr2.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+/*
+ * Copies a w x h block of bytes from src to dst one row at a time, stepping
+ * the two pointers by src_stride / dst_stride after each row.
+ *
+ * Widths 4, 8, 16, 32 and 64 move each row as 32-bit words with inline
+ * assembly ("ulw" loads, "sw" stores); any other width falls back to a plain
+ * byte loop. h is expected to be non-negative (the loops count h down to 0).
+ *
+ * NOTE(review): the word paths load with "ulw" but store with plain "sw", so
+ * they presumably rely on each dst row being 4-byte aligned -- confirm
+ * against callers.
+ */
+void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  int x, y;
+
+  /* prefetch the first row of each buffer before entering the copy loops */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
+
+  /*
+   * Every asm statement below names "memory" as a clobber: it reads src and
+   * writes dst through pointers passed as plain register inputs, so the
+   * compiler must be told not to cache or reorder accesses to those buffers
+   * across the statement.
+   */
+  switch (w) {
+    case 4: {
+      uint32_t tp1;
+      /* 1 word (4 bytes) per row */
+      for (y = h; y--;) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+        __asm__ __volatile__(
+            "ulw %[tp1], (%[src]) \n\t"
+            "sw %[tp1], (%[dst]) \n\t" /* store */
+            : [tp1] "=&r"(tp1)
+            : [src] "r"(src), [dst] "r"(dst)
+            : "memory");
+        src += src_stride;
+        dst += dst_stride;
+      }
+    } break;
+    case 8: {
+      uint32_t tp1, tp2;
+      /* 2 words (8 bytes) per row */
+      for (y = h; y--;) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+        __asm__ __volatile__(
+            "ulw %[tp1], 0(%[src]) \n\t"
+            "ulw %[tp2], 4(%[src]) \n\t"
+            "sw %[tp1], 0(%[dst]) \n\t" /* store */
+            "sw %[tp2], 4(%[dst]) \n\t" /* store */
+            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+            : [src] "r"(src), [dst] "r"(dst)
+            : "memory");
+        src += src_stride;
+        dst += dst_stride;
+      }
+    } break;
+    case 16: {
+      uint32_t tp1, tp2, tp3, tp4;
+      /* 4 words (16 bytes) per row */
+      for (y = h; y--;) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+        __asm__ __volatile__(
+            "ulw %[tp1], 0(%[src]) \n\t"
+            "ulw %[tp2], 4(%[src]) \n\t"
+            "ulw %[tp3], 8(%[src]) \n\t"
+            "ulw %[tp4], 12(%[src]) \n\t"
+
+            "sw %[tp1], 0(%[dst]) \n\t" /* store */
+            "sw %[tp2], 4(%[dst]) \n\t" /* store */
+            "sw %[tp3], 8(%[dst]) \n\t" /* store */
+            "sw %[tp4], 12(%[dst]) \n\t" /* store */
+            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+              [tp4] "=&r"(tp4)
+            : [src] "r"(src), [dst] "r"(dst)
+            : "memory");
+        src += src_stride;
+        dst += dst_stride;
+      }
+    } break;
+    case 32: {
+      uint32_t tp1, tp2, tp3, tp4, tp5, tp6, tp7, tp8;
+      /* 8 words (32 bytes) per row */
+      for (y = h; y--;) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+        __asm__ __volatile__(
+            "ulw %[tp1], 0(%[src]) \n\t"
+            "ulw %[tp2], 4(%[src]) \n\t"
+            "ulw %[tp3], 8(%[src]) \n\t"
+            "ulw %[tp4], 12(%[src]) \n\t"
+            "ulw %[tp5], 16(%[src]) \n\t"
+            "ulw %[tp6], 20(%[src]) \n\t"
+            "ulw %[tp7], 24(%[src]) \n\t"
+            "ulw %[tp8], 28(%[src]) \n\t"
+
+            "sw %[tp1], 0(%[dst]) \n\t" /* store */
+            "sw %[tp2], 4(%[dst]) \n\t" /* store */
+            "sw %[tp3], 8(%[dst]) \n\t" /* store */
+            "sw %[tp4], 12(%[dst]) \n\t" /* store */
+            "sw %[tp5], 16(%[dst]) \n\t" /* store */
+            "sw %[tp6], 20(%[dst]) \n\t" /* store */
+            "sw %[tp7], 24(%[dst]) \n\t" /* store */
+            "sw %[tp8], 28(%[dst]) \n\t" /* store */
+            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+            : [src] "r"(src), [dst] "r"(dst)
+            : "memory");
+        src += src_stride;
+        dst += dst_stride;
+      }
+    } break;
+    case 64: {
+      uint32_t tp1, tp2, tp3, tp4, tp5, tp6, tp7, tp8;
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
+      /* 16 words (64 bytes) per row, moved as two 8-word halves */
+      for (y = h; y--;) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
+        __asm__ __volatile__(
+            "ulw %[tp1], 0(%[src]) \n\t"
+            "ulw %[tp2], 4(%[src]) \n\t"
+            "ulw %[tp3], 8(%[src]) \n\t"
+            "ulw %[tp4], 12(%[src]) \n\t"
+            "ulw %[tp5], 16(%[src]) \n\t"
+            "ulw %[tp6], 20(%[src]) \n\t"
+            "ulw %[tp7], 24(%[src]) \n\t"
+            "ulw %[tp8], 28(%[src]) \n\t"
+
+            "sw %[tp1], 0(%[dst]) \n\t" /* store */
+            "sw %[tp2], 4(%[dst]) \n\t" /* store */
+            "sw %[tp3], 8(%[dst]) \n\t" /* store */
+            "sw %[tp4], 12(%[dst]) \n\t" /* store */
+            "sw %[tp5], 16(%[dst]) \n\t" /* store */
+            "sw %[tp6], 20(%[dst]) \n\t" /* store */
+            "sw %[tp7], 24(%[dst]) \n\t" /* store */
+            "sw %[tp8], 28(%[dst]) \n\t" /* store */
+            /* second half of the row (bytes 32..63) reuses tp1..tp8 */
+            "ulw %[tp1], 32(%[src]) \n\t"
+            "ulw %[tp2], 36(%[src]) \n\t"
+            "ulw %[tp3], 40(%[src]) \n\t"
+            "ulw %[tp4], 44(%[src]) \n\t"
+            "ulw %[tp5], 48(%[src]) \n\t"
+            "ulw %[tp6], 52(%[src]) \n\t"
+            "ulw %[tp7], 56(%[src]) \n\t"
+            "ulw %[tp8], 60(%[src]) \n\t"
+
+            "sw %[tp1], 32(%[dst]) \n\t" /* store */
+            "sw %[tp2], 36(%[dst]) \n\t" /* store */
+            "sw %[tp3], 40(%[dst]) \n\t" /* store */
+            "sw %[tp4], 44(%[dst]) \n\t" /* store */
+            "sw %[tp5], 48(%[dst]) \n\t" /* store */
+            "sw %[tp6], 52(%[dst]) \n\t" /* store */
+            "sw %[tp7], 56(%[dst]) \n\t" /* store */
+            "sw %[tp8], 60(%[dst]) \n\t" /* store */
+            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+            : [src] "r"(src), [dst] "r"(dst)
+            : "memory");
+        src += src_stride;
+        dst += dst_stride;
+      }
+    } break;
+    default:
+      for (y = h; y--;) {
+        for (x = 0; x < w; ++x) {
+          dst[x] = src[x];
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
+#endif