Merge "Clean-ups in rdopt.c"
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index d2fa4c1..dc88f16 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -81,5 +81,34 @@
);
}
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h);
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
#endif // #if HAVE_DSPR2
#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
new file mode 100644
index 0000000..91d62bc
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit positon for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_avg_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64:
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000..148b20f
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3;
+ uint32_t tn1, tn2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tp4], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tp4], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tp3], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp4], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
+
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
+
+ /* store bytes */
+ "sb %[tp3], 3(%[dst]) \n\t"
+ "sb %[tp4], 5(%[dst]) \n\t"
+ "sb %[tp1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [n1] "=&r" (n1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit positon for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 8:
+ convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ case 16:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 1);
+ break;
+ case 32:
+ convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, h);
+ break;
+ default:
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
new file mode 100644
index 0000000..bc422bc
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [dst_ptr] "+r" (dst_ptr)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+ [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[Temp1], %[p3](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ );
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h) {
+ uint32_t pos = 38;
+
+ /* bit positon for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h,
+ (w/16));
+ break;
+ case 64:
+ vp9_prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride,
+ dst, dst_stride,
+ filter, w, h);
+ break;
+ }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
new file mode 100644
index 0000000..1debdb4
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == x_step_q4) {
+ uint32_t pos = 38;
+
+ vp9_prefetch_load((const uint8_t *)filter_x);
+
+ /* bit positon for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src);
+ vp9_prefetch_load(src + 32);
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ vp9_prefetch_load(src + 64);
+ vp9_prefetch_store(dst + 32);
+
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filter_x, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_horiz_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
new file mode 100644
index 0000000..8eb105c
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == y_step_q4) {
+ uint32_t pos = 38;
+
+ /* bit positon for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4 :
+ case 8 :
+ case 16 :
+ case 32 :
+ convolve_bi_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64 :
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index 0930ad1..da7f0fd 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -355,6 +355,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_avg_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == y_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
index 37c665b..69da1cf 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
@@ -965,6 +965,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_avg_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == x_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 2c48bd0..126e05a 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -930,6 +930,21 @@
}
}
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x * dst_stride] = src[x];
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -966,20 +981,14 @@
/* copy the src to dst */
if (filter_x[3] == 0x80) {
- int32_t y;
- int32_t c;
- const uint8_t *src_ptr = src - src_stride * 3;
- uint8_t *dst_ptr = temp;
-
- for (y = intermediate_height; y--;) {
- for (c = 0; c < w; c++) {
- dst_ptr[c * intermediate_height] = src_ptr[c];
- }
-
- /* next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
+ copy_horiz_transposed(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ w, intermediate_height);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ filter_x,
+ w, intermediate_height);
} else {
src -= (src_stride * 3 + 3);
@@ -1021,20 +1030,14 @@
/* copy the src to dst */
if (filter_y[3] == 0x80) {
- int32_t y;
- int32_t c;
- uint8_t *src_ptr = temp + 3;
- uint8_t *dst_ptr = dst;
-
- for (y = w; y--;) {
- for (c = 0; c < h; c++) {
- dst_ptr[c * dst_stride] = src_ptr[c];
- }
-
- /* next row... */
- src_ptr += intermediate_height;
- dst_ptr += 1;
- }
+ copy_horiz_transposed(temp + 3, intermediate_height,
+ dst, dst_stride,
+ h, w);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_dspr2(temp + 3, intermediate_height,
+ dst, dst_stride,
+ filter_y,
+ h, w);
} else {
switch (h) {
case 4:
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
index 743d641..0303896 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
@@ -849,6 +849,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == x_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
index bdc7930..0930bb3 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
@@ -341,6 +341,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == y_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f116c06..66fc1cb 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -578,7 +578,7 @@
}
}
-static int get_tx_eob(struct segmentation *seg, int segment_id,
+static int get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
const int eob_max = 16 << (tx_size << 1);
return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 1087536..7619d76 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -406,8 +406,6 @@
const BLOCK_SIZE bsize = mi->sb_type;
const int allow_hp = xd->allow_high_precision_mv;
- x->partition_info = x->pi + (m - cm->mi);
-
#ifdef ENTROPY_STATS
active_section = 9;
#endif
@@ -489,7 +487,7 @@
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const int j = idy * 2 + idx;
- const MB_PREDICTION_MODE blockmode = x->partition_info->bmi[j].mode;
+ const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
write_sb_mv_ref(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(blockmode)];
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2e28a2e..6314b60 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -23,16 +23,9 @@
int offset;
} search_site;
-typedef struct {
- struct {
- MB_PREDICTION_MODE mode;
- } bmi[4];
-} PARTITION_INFO;
-
// Structure to hold snapshot of coding context during the mode picking process
typedef struct {
MODE_INFO mic;
- PARTITION_INFO partition_info;
unsigned char zcoeff_blk[256];
int skip;
int_mv best_ref_mv;
@@ -87,9 +80,6 @@
MACROBLOCKD e_mbd;
int skip_block;
- PARTITION_INFO *partition_info; /* work pointer */
- PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
- PARTITION_INFO *pip; /* Base of allocated array */
search_site *ss;
int ss_count;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index bd48a9b..f0282b3 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -381,7 +381,6 @@
}
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
- *x->partition_info = ctx->partition_info;
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
}
@@ -492,9 +491,6 @@
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
x->active_ptr = cpi->active_map + idx_map;
- /* pointers to mode info contexts */
- x->partition_info = x->pi + idx_str;
-
xd->mi_8x8 = cm->mi_grid_visible + idx_str;
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 04a4172..9ebcc49 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -337,7 +337,7 @@
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
- if (x->partition_info->bmi[i].mode == NEWMV)
+ if (mi->bmi[i].as_mode == NEWMV)
inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index fee0f15..b2becbb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -515,7 +515,6 @@
setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
setup_dst_planes(xd, new_yv12, 0, 0);
- x->partition_info = x->pi;
xd->mi_8x8 = cm->mi_grid_visible;
// required for vp9_frame_init_quantizer
xd->this_mi =
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e35e5e1..ff1d179 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -309,9 +309,6 @@
cpi->mb_activity_map = 0;
vpx_free(cpi->mb_norm_activity_map);
cpi->mb_norm_activity_map = 0;
-
- vpx_free(cpi->mb.pip);
- cpi->mb.pip = 0;
}
// Computes a q delta (in "q index" terms) to get from a starting q value
@@ -983,20 +980,6 @@
"Failed to allocate altref buffer");
}
-static int alloc_partition_data(VP9_COMP *cpi) {
- vpx_free(cpi->mb.pip);
-
- cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
- (cpi->common.mi_rows + MI_BLOCK_SIZE),
- sizeof(PARTITION_INFO));
- if (!cpi->mb.pip)
- return 1;
-
- cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
- return 0;
-}
-
void vp9_alloc_compressor_data(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
@@ -1004,10 +987,6 @@
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
- if (alloc_partition_data(cpi))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate partition data");
-
if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c2286fb..079463c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1507,7 +1507,8 @@
if (has_second_rf)
mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
- x->partition_info->bmi[i].mode = m;
+ mic->bmi[i].as_mode = m;
+
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
@@ -1654,7 +1655,7 @@
BEST_SEG_INFO *bsi_buf, int filter_idx,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
- int i, j, br = 0, idx, idy;
+ int i, br = 0, idx, idy;
int64_t bd = 0, block_sse = 0;
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
@@ -2016,15 +2017,6 @@
bsi->segment_rd = INT64_MAX;
return;
}
-
- for (j = 1; j < num_4x4_blocks_high; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j * 2],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
- for (j = 1; j < num_4x4_blocks_wide; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
}
} /* for each label */
@@ -2036,7 +2028,7 @@
// update the coding decisions
for (i = 0; i < 4; ++i)
- bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->modes[i] = mi->bmi[i].as_mode;
}
static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2081,7 +2073,7 @@
if (has_second_ref(mbmi))
mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
- x->partition_info->bmi[i].mode = bsi->modes[i];
+ mi->bmi[i].as_mode = bsi->modes[i];
}
/*
@@ -2214,7 +2206,6 @@
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
- PARTITION_INFO *partition,
int_mv *ref_mv,
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
@@ -2228,9 +2219,6 @@
ctx->best_mode_index = mode_index;
ctx->mic = *xd->this_mi;
- if (partition)
- ctx->partition_info = *partition;
-
ctx->best_ref_mv.as_int = ref_mv->as_int;
ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
@@ -3788,7 +3776,6 @@
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
- NULL,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
@@ -3847,7 +3834,6 @@
cpi->common.y_dc_delta_q);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
- PARTITION_INFO best_partition;
int best_skip2 = 0;
unsigned char best_zcoeff_blk[256] = { 0 };
@@ -4063,7 +4049,6 @@
&mbmi->ref_mvs[second_ref_frame][0] : NULL;
b_mode_info tmp_best_bmodes[16];
MB_MODE_INFO tmp_best_mbmode;
- PARTITION_INFO tmp_best_partition;
BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
int pred_exists = 0;
int uv_skippable;
@@ -4127,7 +4112,6 @@
tmp_best_sse = total_sse;
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
- tmp_best_partition = *x->partition_info;
for (i = 0; i < 4; i++)
tmp_best_bmodes[i] = xd->this_mi->bmi[i];
pred_exists = 1;
@@ -4179,7 +4163,6 @@
distortion = tmp_best_distortion;
skippable = tmp_best_skippable;
*mbmi = tmp_best_mbmode;
- *x->partition_info = tmp_best_partition;
for (i = 0; i < 4; i++)
xd->this_mi->bmi[i] = tmp_best_bmodes[i];
}
@@ -4307,7 +4290,6 @@
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
- best_partition = *x->partition_info;
vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(best_zcoeff_blk));
@@ -4465,15 +4447,8 @@
for (i = 0; i < 4; i++)
xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
} else {
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[0].as_int =
- best_bmodes[i].as_mv[0].as_int;
-
- if (has_second_ref(mbmi))
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int;
-
- *x->partition_info = best_partition;
+ for (i = 0; i < 4; ++i)
+ vpx_memcpy(&xd->this_mi->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
@@ -4516,7 +4491,6 @@
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
- &best_partition,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 10fa461..99b21e2 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -89,6 +89,11 @@
# common (c)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c