blob: fae8192d6a98555e40b8644bf43b14674058cc6a [file] [log] [blame]
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// Size in bytes of one record in lf_blocks; main() multiplies a block index by this value
// to form the byte address it loads from.
#define BlockSize 32
// Group-shared staging area for unpacked edge words. main() uses 16 consecutive entries per
// group of 4 threads, and a 64-thread group contains 64 / 4 such groups.
groupshared uint input_edges[16 * (64 / 4)];
// Smooths 8-bit samples on both sides of one edge, reading and rewriting `buffer` in place.
//
// buffer - byte-addressed buffer holding the samples.
// offset - byte offset passed by the caller; 16 bytes starting at `offset - 8` are loaded, and the
//          edge lies between the 8th and 9th loaded byte.
//          NOTE(review): assuming little-endian dword packing, the 9th byte is the one at `offset`.
// limits - .x: maximum allowed difference between neighbouring samples on the same side,
//          .y: maximum allowed combined difference across the edge,
//          .z: threshold for the "hev" test used by the narrow filter. .w is not read here.
// type   - filter selector: 0 allows only the narrow filter; 1 additionally allows the smoother
//          that rewrites 2 samples per side; 2 allows the one that rewrites 3 per side; 3 also
//          allows the wide one that rewrites 6 per side.
//          NOTE(review): these look like the AV1 4/6/8/14-tap deblocking filters - confirm against the spec.
//
// Naming: after unpacking, the 16 samples in load order are
//   p0.x p0.y p0.z p0.w p1.x p1.y p1.z p1.w | q0.x q0.y q0.z q0.w q1.x q1.y q1.z q1.w
// so p1.w and q0.x are the two samples touching the edge, and the component order on the q side
// is mirrored relative to the p side (p1.z pairs with q0.y, p1.y with q0.z, and so on).
void filter_vertical_edge(RWByteAddressBuffer buffer, uint offset, int4 limits, uint type) {
uint4 pixels = buffer.Load4(offset - 8);
int4 p0, p1, q0, q1;
// Unpack each dword into four 8-bit samples, least-significant byte first.
p0.x = (pixels.x >> 0) & 0xff;
p0.y = (pixels.x >> 8) & 0xff;
p0.z = (pixels.x >> 16) & 0xff;
p0.w = (pixels.x >> 24) & 0xff;
p1.x = (pixels.y >> 0) & 0xff;
p1.y = (pixels.y >> 8) & 0xff;
p1.z = (pixels.y >> 16) & 0xff;
p1.w = (pixels.y >> 24) & 0xff;
q0.x = (pixels.z >> 0) & 0xff;
q0.y = (pixels.z >> 8) & 0xff;
q0.z = (pixels.z >> 16) & 0xff;
q0.w = (pixels.z >> 24) & 0xff;
q1.x = (pixels.w >> 0) & 0xff;
q1.y = (pixels.w >> 8) & 0xff;
q1.z = (pixels.w >> 16) & 0xff;
q1.w = (pixels.w >> 24) & 0xff;
// Filter mask: the two samples nearest the edge on each side must differ by at most limits.x,
// and the weighted difference across the edge must be at most limits.y.
int mask = abs(p1.z - p1.w) <= limits.x && abs(q0.x - q0.y) <= limits.x &&
abs(p1.w - q0.x) * 2 + (abs(p1.z - q0.y) >> 1) <= limits.y;
// For type >= 1 the third sample from the edge is also checked; for type >= 2 the fourth as well.
mask &= (abs(p1.y - p1.z) <= limits.x && abs(q0.y - q0.z) <= limits.x) | (type == 0);
mask &= (abs(p1.x - p1.y) <= limits.x && abs(q0.z - q0.w) <= limits.x) | (type <= 1);
// Mask failed: leave the buffer untouched.
if (!mask) return;
// Flatness tests: every listed sample is within 1 of the edge sample on its own side.
// flat_uv covers 3 samples per side, flat covers 4, and flat2 covers the next 3 further out.
int flat_uv = abs(p1.y - p1.w) <= 1 && abs(p1.z - p1.w) <= 1 && abs(q0.y - q0.x) <= 1 && abs(q0.z - q0.x) <= 1;
int flat = flat_uv && abs(p1.x - p1.w) <= 1 && abs(q0.w - q0.x) <= 1;
int flat2 = abs(p0.y - p1.w) <= 1 && abs(p0.z - p1.w) <= 1 && abs(p0.w - p1.w) <= 1 && abs(q1.x - q0.x) <= 1 &&
abs(q1.y - q0.x) <= 1 && abs(q1.z - q0.x) <= 1;
if (type == 3 && flat && flat2) {
// Wide smoother: rewrites 6 samples on each side (bytes 2..13 of the 16 loaded).
// Every output is a weighted sum whose weights add up to 16, rounded with +8 and shifted by 4.
// Keep the two outermost bytes at each end (p0.x, p0.y and q1.z, q1.w) unchanged.
pixels.x &= 0x0000ffff;
pixels.w &= 0xffff0000;
pixels.x |= ((p0.y * 7 + p0.z * 2 + p0.w * 2 + p1.x * 1 + p1.y * 1 + p1.z * 1 + p1.w * 1 + q0.x * 1 + 8) >> 4)
<< 16;
pixels.x |=
((p0.y * 5 + p0.z * 2 + p0.w * 2 + p1.x * 2 + p1.y * 1 + p1.z * 1 + p1.w * 1 + q0.x * 1 + q0.y * 1 + 8) >> 4)
<< 24;
pixels.y = ((p0.y * 4 + p0.z * 1 + p0.w * 2 + p1.x * 2 + p1.y * 2 + p1.z * 1 + p1.w * 1 + q0.x * 1 + q0.y * 1 +
q0.z * 1 + 8) >>
4)
<< 0;
pixels.y |= ((p0.y * 3 + p0.z * 1 + p0.w * 1 + p1.x * 2 + p1.y * 2 + p1.z * 2 + p1.w * 1 + q0.x * 1 + q0.y * 1 +
q0.z * 1 + q0.w * 1 + 8) >>
4)
<< 8;
pixels.y |= ((p0.y * 2 + p0.z * 1 + p0.w * 1 + p1.x * 1 + p1.y * 2 + p1.z * 2 + p1.w * 2 + q0.x * 1 + q0.y * 1 +
q0.z * 1 + q0.w * 1 + q1.x * 1 + 8) >>
4)
<< 16;
pixels.y |= ((p0.y * 1 + p0.z * 1 + p0.w * 1 + p1.x * 1 + p1.y * 1 + p1.z * 2 + p1.w * 2 + q0.x * 2 + q0.y * 1 +
q0.z * 1 + q0.w * 1 + q1.x * 1 + q1.y * 1 + 8) >>
4)
<< 24;
pixels.z = ((p0.z * 1 + p0.w * 1 + p1.x * 1 + p1.y * 1 + p1.z * 1 + p1.w * 2 + q0.x * 2 + q0.y * 2 + q0.z * 1 +
q0.w * 1 + q1.x * 1 + q1.y * 1 + q1.z * 1 + 8) >>
4)
<< 0;
pixels.z |= ((p0.w * 1 + p1.x * 1 + p1.y * 1 + p1.z * 1 + p1.w * 1 + q0.x * 2 + q0.y * 2 + q0.z * 2 + q0.w * 1 +
q1.x * 1 + q1.y * 1 + q1.z * 2 + 8) >>
4)
<< 8;
pixels.z |= ((p1.x * 1 + p1.y * 1 + p1.z * 1 + p1.w * 1 + q0.x * 1 + q0.y * 2 + q0.z * 2 + q0.w * 2 + q1.x * 1 +
q1.y * 1 + q1.z * 3 + 8) >>
4)
<< 16;
pixels.z |= ((p1.y * 1 + p1.z * 1 + p1.w * 1 + q0.x * 1 + q0.y * 1 + q0.z * 2 + q0.w * 2 + q1.x * 2 + q1.y * 1 +
q1.z * 4 + 8) >>
4)
<< 24;
pixels.w |=
((p1.z * 1 + p1.w * 1 + q0.x * 1 + q0.y * 1 + q0.z * 1 + q0.w * 2 + q1.x * 2 + q1.y * 2 + q1.z * 5 + 8) >> 4)
<< 0;
pixels.w |= ((p1.w * 1 + q0.x * 1 + q0.y * 1 + q0.z * 1 + q0.w * 1 + q1.x * 2 + q1.y * 2 + q1.z * 7 + 8) >> 4) << 8;
// All four dwords are written back.
buffer.Store4(offset - 8, pixels);
} else if (type >= 2 && flat) {
// Smoother rewriting 3 samples on each side. `v` is a running sum that always holds 7 of the
// 8 weights; each step drops the oldest left-hand term and adds the next right-hand one, and
// the centre sample of the current output is added separately before rounding (+4, >> 3).
// The statement order matters because `v` is updated between outputs.
int v = p1.x * 3 + p1.y + p1.z + p1.w + q0.x;
// The outermost samples of the two middle dwords (p1.x and, below, q0.w) are stored unchanged.
pixels.y = p1.x;
pixels.y |= ((v + p1.y + 4) >> 3) << 8;
v += -p1.x + q0.y;
pixels.y |= ((v + p1.z + 4) >> 3) << 16;
v += -p1.x + q0.z;
pixels.y |= ((v + p1.w + 4) >> 3) << 24;
v += -p1.x + q0.w;
pixels.z = (v + q0.x + 4) >> 3;
v += -p1.y + q0.w;
pixels.z |= ((v + q0.y + 4) >> 3) << 8;
v += -p1.z + q0.w;
pixels.z |= ((v + q0.z + 4) >> 3) << 16;
pixels.z |= q0.w << 24;
// Only the two middle dwords (8 bytes starting at offset - 4) are written back.
buffer.Store2(offset - 4, pixels.yz);
} else if (type == 1 && flat_uv) {
// Smoother rewriting 2 samples on each side; the outer two bytes of each middle dword are kept.
pixels.y &= 0x0000ffff;
pixels.z &= 0xffff0000;
// 5-tap filter [1, 2, 2, 2, 1]; at the ends the outermost available sample (p1.y / q0.z)
// takes the weight of the missing tap, hence the factor 3. Weights sum to 8, rounded with +4.
pixels.y |= ((p1.y * 3 + p1.z * 2 + p1.w * 2 + q0.x + 4) >> 3) << 16;
pixels.y |= ((p1.y + p1.z * 2 + p1.w * 2 + q0.x * 2 + q0.y + 4) >> 3) << 24;
pixels.z |= ((p1.z + p1.w * 2 + q0.x * 2 + q0.y * 2 + q0.z + 4) >> 3);
pixels.z |= ((p1.w + q0.x * 2 + q0.y * 2 + q0.z * 3 + 4) >> 3) << 8;
buffer.Store2(offset - 4, pixels.yz);
} else {
// Narrow filter: adjusts at most 2 samples on each side, working on values re-centred around
// zero (sample - 128) and clamped to the signed 8-bit range.
// hev is an all-ones mask when either side's two nearest samples differ by more than limits.z.
uint hev = (abs(p1.w - p1.z) > limits.z || abs(q0.x - q0.y) > limits.z) ? 0xffffffff : 0;
int ps1 = p1.z - 128;
int ps0 = p1.w - 128;
int qs0 = q0.x - 128;
int qs1 = q0.y - 128;
// The (ps1 - qs1) term only contributes when hev is set.
int f0 = clamp(ps1 - qs1, -128, 127) & hev;
// Upper bound is 124 rather than 127 so that `f0 + 3` below stays within 127 without its own
// clamp; `f0 + 4` can still reach 128 and is limited by the min().
f0 = clamp(f0 + 3 * (qs0 - ps0), -128, 124);
int f1 = min(f0 + 4, 127) >> 3;
int f2 = (f0 + 3) >> 3;
// Keep the outer two bytes of each middle dword; the inner two on each side are rebuilt.
pixels.y &= 0x0000FFFF;
pixels.z &= 0xFFFF0000;
// Samples touching the edge: move towards each other by f2 / f1, then shift back by +128.
pixels.y |= (clamp(ps0 + f2, -128, 127) + 128) << 24;
pixels.z |= clamp(qs0 - f1, -128, 127) + 128;
// Second samples from the edge get half the adjustment, and none at all when hev is set.
f0 = ((f1 + 1) >> 1) & (~hev);
pixels.y |= (clamp(ps1 + f0, -128, 127) + 128) << 16;
pixels.z |= (clamp(qs1 - f0, -128, 127) + 128) << 8;
buffer.Store2(offset - 4, pixels.yz);
}
}
// Frame buffer whose samples are filtered in place (passed to filter_vertical_edge by main()).
RWByteAddressBuffer dst_frame : register(u0);
// Read-only per-block edge records; main() loads 8 bytes per thread from it.
ByteAddressBuffer lf_blocks : register(t0);
cbuffer LoopfilterData : register(b0) {
// Per-plane parameters: main() reads .x as the row stride and .y as the plane's base byte offset.
// .z and .w are not read in this file.
int4 cb_planes[3];
// Filter thresholds indexed by the 6-bit level field of an edge word; .x, .y and .z are
// consumed by filter_vertical_edge.
int4 cb_limits[64];
};
cbuffer LoopfilterSRT : register(b1) {
// Number of threads that have work; threads with a dispatch id at or above this do nothing.
uint cb_wicount;
// Index into cb_planes for this dispatch.
uint cb_plane;
// Added to the block id to form the record index into lf_blocks.
uint cb_offset_base;
// Number of blocks per row, used to split a block id into column and row.
uint cb_block_cols;
// Block id handled by the first group of four threads of the dispatch.
uint cb_block_id_offset;
};
// Compute-shader entry point: applies filter_vertical_edge along one row of one block.
//
// Work distribution:
//  - Four consecutive threads (a "quad") share one block; `wi` (0..3) selects the row inside it.
//  - Quads handle every second block id starting at cb_block_id_offset.
//    NOTE(review): presumably so that blocks filtered in the same dispatch never touch the same
//    bytes - confirm against the host-side dispatch code.
//  - Each thread loads 8 bytes (four 16-bit edge words) of the block's 32-byte record and publishes
//    them to group-shared memory, so that afterwards every thread of the quad can walk all 16 edge
//    words of the block.
//
// Edge word layout: bits 0..5 = level (index into cb_limits, 0 means "do not filter"),
// bits 6..7 = filter type, bits 8..15 = number of 4-byte columns to advance to the next edge.
// A zero word terminates the row.
//
// NOTE(review): if cb_wicount is not a multiple of 4, the active threads of the last quad read
// edge words that no thread wrote - confirm the host always rounds the count to whole quads.
[numthreads(64, 1, 1)] void main(uint3 thread : SV_DispatchThreadID) {
    // Out-of-range threads are masked instead of returning early: the group-wide barrier below
    // must be reached by every thread of the group, in uniform control flow.
    const bool in_range = thread.x < cb_wicount;
    const int plane = cb_plane;
    const int wi = thread.x & 3;
    const uint block_id = ((thread.x >> 2) << 1) + cb_block_id_offset;
    // First of the 16 group-shared slots owned by this thread's quad.
    const int local_offset = (thread.x & (64 - 4)) << 2;
    if (in_range) {
        uint2 data = lf_blocks.Load2((cb_offset_base + block_id) * BlockSize + (wi << 3));
        input_edges[local_offset + (wi << 2) + 0] = data.x & 0xffff;
        input_edges[local_offset + (wi << 2) + 1] = (data.x >> 16);
        input_edges[local_offset + (wi << 2) + 2] = data.y & 0xffff;
        input_edges[local_offset + (wi << 2) + 3] = (data.y >> 16);
    }
    // Each thread is about to read slots written by the other three threads of its quad.
    // A plain GroupMemoryBarrier() does not wait for those threads to have executed their
    // writes; the syncing variant does.
    GroupMemoryBarrierWithGroupSync();
    if (!in_range) return;
    const int block_x = block_id % cb_block_cols;
    const int block_y = block_id / cb_block_cols;
    const int stride = cb_planes[plane].x;
    // A block spans 64 bytes horizontally and 4 rows vertically.
    const int addr = cb_planes[plane].y + block_x * 64 + (block_y * 4 + wi) * stride;
    for (int col = 0; col < 16;) {
        uint edge = input_edges[local_offset + col];
        if (!edge) break;
        int level = edge & 63;
        int filter = (edge >> 6) & 3;
        int step = edge >> 8;
        if (level) {
            const int4 limits = cb_limits[level];
            filter_vertical_edge(dst_frame, addr + col * 4, limits, filter);
        }
        // A non-zero edge word with a zero step would never advance `col`; stop instead of
        // spinning forever on malformed input.
        if (step == 0) break;
        col += step;
    }
}