/*
* Copyright (c) 2021, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 3-Clause Clear License
* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
* License was not distributed with this source code in the LICENSE file, you
* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
* Alliance for Open Media Patent License 1.0 was not distributed with this
* source code in the PATENTS file, you can obtain it at
* aomedia.org/license/patent-license/.
*/
#include <stdlib.h>
#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/mips/common_dspr2.h"
#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
#include "aom_mem/aom_mem.h"
#if HAVE_DSPR2
void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
uint8_t i;
uint32_t mask;
uint32_t hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
uint32_t thresh_vec, flimit_vec, limit_vec;
uint32_t uflimit, ulimit, uthresh;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
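  /* replv.qb replicates the least-significant byte of the source register
     into all four byte lanes (the quad-byte equivalent of multiplying an
     8-bit value by 0x01010101), so each threshold can be compared against
     four pixels at once. */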
__asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r"(limit_vec)
: [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
/* prefetch data for store */
prefetch_store(s);
  /* The loop filter is designed to work on chars so that we can make
     maximum use of the 8-bit SIMD instructions; each iteration handles
     four pixels along the edge. */
for (i = 0; i < 2; i++) {
sm1 = s - (pitch << 2);
s0 = sm1 + pitch;
s1 = s0 + pitch;
s2 = s - pitch;
s3 = s;
s4 = s + pitch;
s5 = s4 + pitch;
s6 = s5 + pitch;
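    /* rows above the horizontal edge: sm1 = p3, s0 = p2, s1 = p1, s2 = p0;
       rows below it: s3 = q0, s4 = q1, s5 = q2, s6 = q3 */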
__asm__ __volatile__(
"lw %[p1], (%[s1]) \n\t"
"lw %[p2], (%[s2]) \n\t"
"lw %[p3], (%[s3]) \n\t"
"lw %[p4], (%[s4]) \n\t"
: [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
: [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
    /* if (p1 - p4 == 0) and (p2 - p3 == 0), the mask will be zero
       and filtering is not needed */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
__asm__ __volatile__(
"lw %[pm1], (%[sm1]) \n\t"
"lw %[p0], (%[s0]) \n\t"
"lw %[p5], (%[s5]) \n\t"
"lw %[p6], (%[s6]) \n\t"
: [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
: [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
p6, thresh_vec, &hev, &mask);
      /* if mask == 0, filtering is not needed */
if (mask) {
        /* filtering: adjust p1, p0, q0 and q1 (held in p1..p4) */
filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
__asm__ __volatile__(
"sw %[p1], (%[s1]) \n\t"
"sw %[p2], (%[s2]) \n\t"
"sw %[p3], (%[s3]) \n\t"
"sw %[p4], (%[s4]) \n\t"
:
: [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
[s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
}
}
s = s + 4;
}
}
void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
uint8_t i;
uint32_t mask, hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
uint8_t *s1, *s2, *s3, *s4;
uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
uint32_t thresh_vec, flimit_vec, limit_vec;
uint32_t uflimit, ulimit, uthresh;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte */
__asm__ __volatile__(
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
[limit_vec] "=r"(limit_vec)
: [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
/* prefetch data for store */
prefetch_store(s + pitch);
for (i = 0; i < 2; i++) {
s1 = s;
s2 = s + pitch;
s3 = s2 + pitch;
s4 = s3 + pitch;
s = s4 + pitch;
    /* load quad-byte vectors
     * for each row, the word at s - 4 holds the four pixels left of the
     * vertical edge and the word at s the four pixels right of it
     * memory is 4 byte aligned
     */
p2 = *((uint32_t *)(s1 - 4));
p6 = *((uint32_t *)(s1));
p1 = *((uint32_t *)(s2 - 4));
p5 = *((uint32_t *)(s2));
p0 = *((uint32_t *)(s3 - 4));
p4 = *((uint32_t *)(s3));
pm1 = *((uint32_t *)(s4 - 4));
p3 = *((uint32_t *)(s4));
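    /* the two asm blocks below transpose the 4x4 byte tiles held in
       registers: the precrq/precr.qb.ph pairs interleave odd and even
       bytes, then precrq.ph.w and append merge the halfwords, leaving one
       edge-parallel column per register so the filtering itself can reuse
       the same code as the horizontal case */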
/* transpose pm1, p0, p1, p2 */
__asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
"precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
"precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
"precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
"precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
"precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
"precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
"precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
"append %[p1], %[sec3], 16 \n\t"
"append %[pm1], %[sec4], 16 \n\t"
: [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
[pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
:);
/* transpose p3, p4, p5, p6 */
__asm__ __volatile__(
"precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
"precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
"precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
"precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
"precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
"precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
"precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
"precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
"precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
"precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
"append %[p5], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t"
: [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
[prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
[p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
:);
    /* if (p1 - p4 == 0) and (p2 - p3 == 0),
     * the mask will be zero and filtering is not needed
     */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
p6, thresh_vec, &hev, &mask);
      /* if mask == 0, filtering is not needed */
if (mask) {
        /* filtering: adjust p1, p0, q0 and q1 (held in p1..p4) */
filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
        /* unpack the processed 4x4 neighborhood one byte per store: after
         * each row the words are shifted right by 8 so the next row's
         * bytes move into the low lane; don't use a transposed word store
         * on the output because the memory isn't aligned
         */
__asm__ __volatile__(
"sb %[p4], 1(%[s4]) \n\t"
"sb %[p3], 0(%[s4]) \n\t"
"sb %[p2], -1(%[s4]) \n\t"
"sb %[p1], -2(%[s4]) \n\t"
:
: [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[s4] "r"(s4));
__asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
: [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
:);
__asm__ __volatile__(
"sb %[p4], 1(%[s3]) \n\t"
"sb %[p3], 0(%[s3]) \n\t"
"sb %[p2], -1(%[s3]) \n\t"
"sb %[p1], -2(%[s3]) \n\t"
: [p1] "+r"(p1)
: [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
__asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
: [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
:);
__asm__ __volatile__(
"sb %[p4], 1(%[s2]) \n\t"
"sb %[p3], 0(%[s2]) \n\t"
"sb %[p2], -1(%[s2]) \n\t"
"sb %[p1], -2(%[s2]) \n\t"
:
: [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[s2] "r"(s2));
__asm__ __volatile__(
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
: [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
:);
__asm__ __volatile__(
"sb %[p4], 1(%[s1]) \n\t"
"sb %[p3], 0(%[s1]) \n\t"
"sb %[p2], -1(%[s1]) \n\t"
"sb %[p1], -2(%[s1]) \n\t"
:
: [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
[s1] "r"(s1));
}
}
}
}
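/* The dual variants below filter a 16-pixel edge as two 8-pixel halves by
   calling the single-edge kernel twice: offset by 8 pixels along the row
   for horizontal edges, and by 8 rows (8 * pitch) for vertical edges. */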
void aom_lpf_horizontal_4_dual_dspr2(
uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *limit1, const uint8_t *thresh1) {
aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}
void aom_lpf_horizontal_8_dual_dspr2(
uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *limit1, const uint8_t *thresh1) {
aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}
void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *limit0,
const uint8_t *thresh0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}
void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *limit0,
const uint8_t *thresh0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}
void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif // #if HAVE_DSPR2