Clean up and speed up CLPF clipping
* Move the clipping tests from inside the loops to outside them (see
  the sketch below)
* Let the sizex and sizey arguments to clpf_block() be the clipped
  block size rather than both just bs
* Make the tests for falling back to C more accurate
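
The first point is the heart of the change. A rough sketch of the
restructuring in plain C, where load_unaligned() and
shuffle_from_centre() are illustrative scalar stand-ins for the
v64/v128 intrinsics, not real helpers:

    #include <stdint.h>

    static int load_unaligned(const uint8_t *p) { return *p; }
    static int shuffle_from_centre(int o) { return o; }

    /* Before: always issue the neighbour load, then conditionally
       discard it and rebuild the value from the centre vector o. */
    static int get_b_before(const uint8_t *src, int x0, int o) {
      int b = load_unaligned(src - 2 * !!x0); /* degenerate if x0 == 0 */
      if (!x0) b = shuffle_from_centre(o);    /* left edge: clip */
      return b;
    }

    /* After: test the clipping condition first, so the clipped edge
       case never pays for the redundant unaligned load. */
    static int get_b_after(const uint8_t *src, int x0, int o) {
      if (x0) return load_unaligned(src - 2); /* interior neighbour */
      return shuffle_from_centre(o);          /* left edge: clip */
    }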
Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 979856b..6fef4b7 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -76,24 +76,27 @@
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
- v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
- v64_load_unaligned(src - 2 * !!x0 + sstride));
- v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
- v64_load_unaligned(src - !!x0 + sstride));
- v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
- v64_load_unaligned(src + !!right + sstride));
- v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
- v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+ v128 b, c, d, e;
- if (!x0) { // Left clipping
- b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
- c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+ if (x0) {
+ b = v128_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src - 2 + sstride));
+ c = v128_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src - 1 + sstride));
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
}
- if (!right) { // Right clipping
- d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
- e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+ if (right) {
+ d = v128_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + 1 + sstride));
+ e = v128_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + 2 + sstride));
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -134,31 +137,34 @@
const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
v128 o = v128_from_32(l1, l2, l3, l4);
const v128 a = v128_from_32(l0, l1, l2, l3);
- v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
- u32_load_unaligned(src + sstride - 2 * !!x0),
- u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
- u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
- v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
- u32_load_unaligned(src + sstride - !!x0),
- u32_load_unaligned(src + 2 * sstride - !!x0),
- u32_load_unaligned(src + 3 * sstride - !!x0));
- v128 d = v128_from_32(u32_load_unaligned(src + !!right),
- u32_load_unaligned(src + sstride + !!right),
- u32_load_unaligned(src + 2 * sstride + !!right),
- u32_load_unaligned(src + 3 * sstride + !!right));
- v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
- u32_load_unaligned(src + sstride + 2 * !!right),
- u32_load_unaligned(src + 2 * sstride + 2 * !!right),
- u32_load_unaligned(src + 3 * sstride + 2 * !!right));
const v128 f = v128_from_32(l2, l3, l4, l5);
+ v128 b, c, d, e;
- if (!x0) { // Left clipping
- b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
- c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+ if (x0) {
+ b = v128_from_32(u32_load_unaligned(src - 2),
+ u32_load_unaligned(src + sstride - 2),
+ u32_load_unaligned(src + 2 * sstride - 2),
+ u32_load_unaligned(src + 3 * sstride - 2));
+ c = v128_from_32(u32_load_unaligned(src - 1),
+ u32_load_unaligned(src + sstride - 1),
+ u32_load_unaligned(src + 2 * sstride - 1),
+ u32_load_unaligned(src + 3 * sstride - 1));
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
}
- if (!right) { // Right clipping
- d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
- e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+ if (right) {
+ d = v128_from_32(u32_load_unaligned(src + 1),
+ u32_load_unaligned(src + sstride + 1),
+ u32_load_unaligned(src + 2 * sstride + 1),
+ u32_load_unaligned(src + 3 * sstride + 1));
+ e = v128_from_32(u32_load_unaligned(src + 2),
+ u32_load_unaligned(src + sstride + 2),
+ u32_load_unaligned(src + 2 * sstride + 2),
+ u32_load_unaligned(src + 3 * sstride + 2));
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -176,9 +182,10 @@
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength) {
- if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
- (sizey & 3 && sizex == 4) || x0 + 4 > width) {
- // Fallback to C for odd sizes
+ if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block width not 4 or 8
+ // * block height not a multiple of 4 if the block width is 4
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
height, strength);
} else {
@@ -255,24 +262,27 @@
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
- v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
- v64_load_unaligned(src - 2 * !!x0 + sstride));
- v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
- v64_load_unaligned(src - !!x0 + sstride));
- v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
- v64_load_unaligned(src + !!right + sstride));
- v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
- v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+ v128 b, c, d, e;
- if (!x0) { // Left clipping
- b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
- c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+ if (x0) {
+ b = v128_from_v64(v64_load_unaligned(src - 2),
+ v64_load_unaligned(src - 2 + sstride));
+ c = v128_from_v64(v64_load_unaligned(src - 1),
+ v64_load_unaligned(src - 1 + sstride));
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
}
- if (!right) { // Right clipping
- d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
- e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+ if (right) {
+ d = v128_from_v64(v64_load_unaligned(src + 1),
+ v64_load_unaligned(src + 1 + sstride));
+ e = v128_from_v64(v64_load_unaligned(src + 2),
+ v64_load_unaligned(src + 2 + sstride));
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
}
calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
@@ -309,18 +319,21 @@
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
- v128 b = v128_load_unaligned(src - 2 * !!x0);
- v128 c = v128_load_unaligned(src - !!x0);
- v128 d = v128_load_unaligned(src + !!right);
- v128 e = v128_load_unaligned(src + 2 * !!right);
+ v128 b, c, d, e;
- if (!x0) { // Left clipping
- b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
- c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+ if (x0) {
+ b = v128_load_unaligned(src - 2);
+ c = v128_load_unaligned(src - 1);
+ } else { // Left clipping
+ b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+ c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
}
- if (!right) { // Right clipping
- d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
- e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+ if (right) {
+ d = v128_load_unaligned(src + 1);
+ e = v128_load_unaligned(src + 2);
+ } else { // Right clipping
+ d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+ e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
}
calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
@@ -332,8 +345,10 @@
int sstride, int dstride, int x0, int y0,
int sizex, int sizey, int width, int height,
unsigned int strength) {
- if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
- // Fallback to C for odd sizes
+ if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+ // Fallback to C for odd sizes:
+ // * block width not 4 or 8
+ // * block height not a multiple of 2 if the block width is 4
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
width, height, strength);
} else {