// blob: 610e8f63677747862b71ffa74baeb07cdf222954 [file] [log] [blame]
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// NOTE(review): 4714 is presumably the compiler's temp-register-pressure
// warning; confirm before removing this pragma.
#pragma warning(disable : 4714)
// Read-only buffer holding one 32-bit CDEF strength index per 2x2 thread groups
// (see the load in main()).
ByteAddressBuffer cdef_index : register(t0);
// Read-only buffer holding one 32-bit skip flag per 8x8 luma block.
ByteAddressBuffer skips : register(t1);
// Frame buffer: source samples are read from it and filtered samples are
// written back to it, at the byte offsets given in CDefData.
RWByteAddressBuffer dst : register(u0);
// Per-dispatch parameters for the CDEF pass, supplied through constant buffer b0.
struct CDefData {
// Luma plane description as consumed by the loaders:
// x = row stride in bytes, y = byte offset of the plane inside `dst`,
// z = width in samples, w = height in rows.
// main() halves z and w to describe the chroma planes.
int4 pl;
// Row stride in bytes of the chroma planes (used for both loads and stores).
int uv_stride;
// Byte offsets inside `dst` where the filtered Y / U / V samples are stored.
int dst_offset_y;
int dst_offset_u;
int dst_offset_v;
// Byte offsets inside `dst` where the unfiltered U / V samples are read.
int uv_offset_u;
int uv_offset_v;
// Row stride, in 32-bit entries, of the `cdef_index` buffer.
int index_stride;
// Row stride, in 32-bit entries, of the `skips` buffer.
int skips_stride;
// Damping value; main() adds (bit_depth - 8) to it and uses the result for both
// the primary and the secondary taps.
int pri_damping;
// Not read by the code visible in this file.
int sec_damping;
// Not read by the code visible in this file.
int pli;
// Non-zero selects the 16-bit-per-sample load/store paths.
int hbd;
// Sample bit depth; main() derives coeff_shift = bit_depth - 8 from it.
int bit_depth;
// Unused filler: together with the 13 scalars above it makes 16 ints.
// NOTE(review): presumably keeps the arrays below aligned with the host-side
// layout - confirm against the code that fills this buffer.
int3 _dummie;
// Tap offsets: [direction][tap].xy is the (dx, dy) step for that tap. main()
// indexes it with dir, dir + 2 and dir + 6.
int4 cdef_directions[16][2];
// Luma strengths, selected by the value read from `cdef_index`; only .x is
// used (value / 4 = primary strength, value % 4 = secondary strength).
int4 cdef_strength[16];
// Chroma strengths, same encoding as cdef_strength.
int4 cdef_uv_strength[16];
};
cbuffer cb_cdef_data : register(b0) { CDefData data; }
// Size, in samples, of the luma tile handled by one thread group.
#define CDEF_BLOCK_WIDTH 32
#define CDEF_BLOCK_HEIGHT 32
// Size, in samples, of each chroma tile handled by one thread group.
#define CDEF_UV_BLOCK_WIDTH 16
#define CDEF_UV_BLOCK_HEIGHT 16
// Not referenced by the code visible in this file.
#define CDEF_WIDTH 64
#define CDEF_HEIGHT 64
// Sentinel stored in `input` for samples that lie outside the plane.
#define CDEF_VERY_LARGE (30000)
// X and Y dimensions of the thread group (the Z dimension is 8, see main()).
#define WG_WIDTH 4
#define WG_HEIGHT 4
// Staging area for source samples including the filter border. Luma uses rows
// 0..37; chroma uses rows 0..21 for U and 24..45 for V.
groupshared int input[CDEF_BLOCK_HEIGHT + 18][CDEF_BLOCK_WIDTH + 8];
// Filtered samples waiting to be packed and stored. Chroma uses rows 0..15 for
// U and 16..31 for V.
groupshared int output[CDEF_BLOCK_HEIGHT][CDEF_BLOCK_WIDTH];
// Per-direction costs exchanged between threads by get_block_dir().
groupshared int costs[8][WG_HEIGHT][WG_WIDTH];
// Per-8x8-block (direction, variance) pair shared between the 8 Z threads.
groupshared int2 temp[WG_HEIGHT][WG_WIDTH];
// TODO: reorganize load (optimization)
// Copies the luma tile whose top-left sample is (gx, gy), plus its filter
// border, from `dst` into the groupshared `input` array. Samples are 8 bits
// wide and are fetched four at a time as one 32-bit word.
//
// plane:         x = row stride in bytes, y = byte offset of the plane in `dst`,
//                z = width in samples, w = height in rows.
// gx, gy:        sample coordinates of the tile origin.
// llx, lly, llz: thread coordinates inside the group; they are flattened to
//                choose the first word column and row this thread handles.
//
// The staged region is CDEF_BLOCK_HEIGHT + 6 rows (3 above, 3 below the tile)
// by CDEF_BLOCK_WIDTH / 4 + 2 words (one extra word on each side). Words that
// fall outside the plane are replaced by CDEF_VERY_LARGE.
// NOTE(review): with the strides used here several threads can fetch the same
// word; they all store identical values. This is what the TODO refers to.
void load_input(int4 plane, int gx, int gy, int llx, int lly, int llz) {
  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_word = flat_id % (CDEF_BLOCK_WIDTH / 4 + 2);
  const int first_row = flat_id / (CDEF_BLOCK_WIDTH / 4 + 2);
  const int words_per_row = CDEF_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_BLOCK_HEIGHT + 6;
  const int plane_words = plane.z >> 2;  // plane width in 4-sample words
  const int word_origin = (gx >> 2) - 1; // first staged word column in the plane
  const int row_origin = gy - 3;         // first staged row in the plane

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int src_y = row_origin + row;
    for (int word = first_word; word < words_per_row; word += WG_WIDTH) {
      const int src_x = word_origin + word;
      const bool inside = (src_x >= 0) && (src_y >= 0) && (src_x < plane_words) && (src_y < plane.w);

      int4 samples = int4(CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE);
      if (inside) {
        // One 32-bit word carries four consecutive 8-bit samples.
        const uint packed = dst.Load(plane.y + src_y * plane.x + src_x * 4);
        samples = int4((packed >> 0) & 255, (packed >> 8) & 255, (packed >> 16) & 255, (packed >> 24) & 255);
      }

      const int col = word * 4;
      input[row][col + 0] = samples.x;
      input[row][col + 1] = samples.y;
      input[row][col + 2] = samples.z;
      input[row][col + 3] = samples.w;
    }
  }
}
// 16-bit-per-sample variant of load_input(): copies the luma tile whose
// top-left sample is (gx, gy), plus its filter border, from `dst` into the
// groupshared `input` array. Four samples are fetched at a time as two 32-bit
// words.
//
// plane:         x = row stride in bytes, y = byte offset of the plane in `dst`,
//                z = width in samples, w = height in rows.
// gx, gy:        sample coordinates of the tile origin.
// llx, lly, llz: thread coordinates inside the group; they are flattened to
//                choose the first 4-sample column and row this thread handles.
//
// The staged region is CDEF_BLOCK_HEIGHT + 6 rows by CDEF_BLOCK_WIDTH / 4 + 2
// four-sample groups. Groups outside the plane are replaced by CDEF_VERY_LARGE.
// NOTE(review): the 0x03ff mask keeps only the low 10 bits of each sample -
// confirm this path is never used with bit depths above 10.
void load_input_hbd(int4 plane, int gx, int gy, int llx, int lly, int llz) {
  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_word = flat_id % (CDEF_BLOCK_WIDTH / 4 + 2);
  const int first_row = flat_id / (CDEF_BLOCK_WIDTH / 4 + 2);
  const int words_per_row = CDEF_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_BLOCK_HEIGHT + 6;
  const int plane_words = plane.z >> 2;  // plane width in 4-sample groups
  const int word_origin = (gx >> 2) - 1; // first staged group column in the plane
  const int row_origin = gy - 3;         // first staged row in the plane

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int src_y = row_origin + row;
    for (int word = first_word; word < words_per_row; word += WG_WIDTH) {
      const int src_x = word_origin + word;
      const bool inside = (src_x >= 0) && (src_y >= 0) && (src_x < plane_words) && (src_y < plane.w);

      int4 samples = int4(CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE);
      if (inside) {
        // Four 16-bit samples occupy 8 bytes, hence the << 3 on the column.
        const uint2 packed = dst.Load2(plane.y + src_y * plane.x + (src_x << 3));
        samples = int4((packed.x >> 0) & 0x03ff, (packed.x >> 16) & 0x03ff, (packed.y >> 0) & 0x03ff,
                       (packed.y >> 16) & 0x03ff);
      }

      const int col = word * 4;
      input[row][col + 0] = samples.x;
      input[row][col + 1] = samples.y;
      input[row][col + 2] = samples.z;
      input[row][col + 3] = samples.w;
    }
  }
}
// TODO: reorganize load (optimization)
// Copies one chroma tile whose top-left sample is (gx, gy), plus its filter
// border, from `dst` into the groupshared `input` array. Samples are 8 bits
// wide and are fetched four at a time as one 32-bit word.
//
// plane:         x = row stride in bytes, y = byte offset of the plane in `dst`,
//                z = width in samples, w = height in rows.
// gx, gy:        sample coordinates of the tile origin within the chroma plane.
// llx, lly, llz: thread coordinates used to choose the first word column and
//                row this thread handles.
// pid:           plane id; rows are stored at an offset of (pid - 1) * 24 in
//                `input`, so pid 1 and pid 2 occupy separate row ranges.
//
// The staged region is CDEF_UV_BLOCK_HEIGHT + 6 rows by
// CDEF_UV_BLOCK_WIDTH / 4 + 2 words. Words outside the plane are replaced by
// CDEF_VERY_LARGE.
void load_uv_input(int4 plane, int gx, int gy, int llx, int lly, int llz, int pid) {
  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_word = flat_id % (CDEF_UV_BLOCK_WIDTH / 4 + 2);
  const int first_row = flat_id / (CDEF_UV_BLOCK_WIDTH / 4 + 2);
  const int words_per_row = CDEF_UV_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_UV_BLOCK_HEIGHT + 6;
  const int plane_words = plane.z >> 2;  // plane width in 4-sample words
  const int word_origin = (gx >> 2) - 1; // first staged word column in the plane
  const int row_origin = gy - 3;         // first staged row in the plane
  const int plane_rows = (pid - 1) * 24; // row offset of this plane inside `input`

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int src_y = row_origin + row;
    const int dst_row = row + plane_rows;
    for (int word = first_word; word < words_per_row; word += WG_WIDTH) {
      const int src_x = word_origin + word;
      const bool inside = (src_x >= 0) && (src_y >= 0) && (src_x < plane_words) && (src_y < plane.w);

      int4 samples = int4(CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE);
      if (inside) {
        // One 32-bit word carries four consecutive 8-bit samples.
        const uint packed = dst.Load(plane.y + src_y * plane.x + src_x * 4);
        samples = int4((packed >> 0) & 255, (packed >> 8) & 255, (packed >> 16) & 255, (packed >> 24) & 255);
      }

      const int col = word * 4;
      input[dst_row][col + 0] = samples.x;
      input[dst_row][col + 1] = samples.y;
      input[dst_row][col + 2] = samples.z;
      input[dst_row][col + 3] = samples.w;
    }
  }
}
// 16-bit-per-sample variant of load_uv_input(): copies one chroma tile whose
// top-left sample is (gx, gy), plus its filter border, from `dst` into the
// groupshared `input` array. Four samples are fetched at a time as two 32-bit
// words.
//
// plane:         x = row stride in bytes, y = byte offset of the plane in `dst`,
//                z = width in samples, w = height in rows.
// gx, gy:        sample coordinates of the tile origin within the chroma plane.
// llx, lly, llz: thread coordinates used to choose the first 4-sample column
//                and row this thread handles.
// pid:           plane id; rows are stored at an offset of (pid - 1) * 24 in
//                `input`, so pid 1 and pid 2 occupy separate row ranges.
//
// Groups outside the plane are replaced by CDEF_VERY_LARGE.
// NOTE(review): the 0x03ff mask keeps only the low 10 bits of each sample -
// confirm this path is never used with bit depths above 10.
void load_uv_input_hbd(int4 plane, int gx, int gy, int llx, int lly, int llz, int pid) {
  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_word = flat_id % (CDEF_UV_BLOCK_WIDTH / 4 + 2);
  const int first_row = flat_id / (CDEF_UV_BLOCK_WIDTH / 4 + 2);
  const int words_per_row = CDEF_UV_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_UV_BLOCK_HEIGHT + 6;
  const int plane_words = plane.z >> 2;  // plane width in 4-sample groups
  const int word_origin = (gx >> 2) - 1; // first staged group column in the plane
  const int row_origin = gy - 3;         // first staged row in the plane
  const int plane_rows = (pid - 1) * 24; // row offset of this plane inside `input`

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int src_y = row_origin + row;
    const int dst_row = row + plane_rows;
    for (int word = first_word; word < words_per_row; word += WG_WIDTH) {
      const int src_x = word_origin + word;
      const bool inside = (src_x >= 0) && (src_y >= 0) && (src_x < plane_words) && (src_y < plane.w);

      int4 samples = int4(CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE, CDEF_VERY_LARGE);
      if (inside) {
        // Four 16-bit samples occupy 8 bytes, hence the << 3 on the column.
        const uint2 packed = dst.Load2(plane.y + src_y * plane.x + (src_x << 3));
        samples = int4((packed.x >> 0) & 0x03ff, (packed.x >> 16) & 0x03ff, (packed.y >> 0) & 0x03ff,
                       (packed.y >> 16) & 0x03ff);
      }

      const int col = word * 4;
      input[dst_row][col + 0] = samples.x;
      input[dst_row][col + 1] = samples.y;
      input[dst_row][col + 2] = samples.z;
      input[dst_row][col + 3] = samples.w;
    }
  }
}
// Returns the staged sample at tile-relative position (x, y). The +3 row and
// +4 column offsets skip the border that the loaders store ahead of the tile,
// so negative x / y down to -4 / -3 address border samples.
int get_loaded_source_sample(int x, int y) {
  const int row = y + 3;
  const int col = x + 4;
  return input[row][col];
}
// Direction search for one 8x8 block, split across the 8 Z threads of a group:
// thread lz evaluates the cost of direction lz, publishes it in the groupshared
// `costs` array, and after a group barrier every thread picks the best one.
//
// x0, y0:      tile-relative position of the block's top-left sample.
// lx, ly, lz:  thread coordinates; (lx, ly) selects the `costs` slot and lz
//              the direction this thread evaluates.
// coeff_shift: right shift applied to each sample before 128 is subtracted.
//
// Returns (best direction, (best cost - cost of direction + 4) >> 10).
// The function contains a group barrier, so every thread of the group has to
// call it.
int2 get_block_dir(int x0, int y0, int lx, int ly, int lz, int coeff_shift) {
  // Weights applied to the squared line sums.
  const int div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
  // Per direction: which div_table entry weights line sum n.
  const int div_table_idx[8][8] = {{1, 2, 3, 4, 5, 6, 7, 8}, {2, 4, 6, 8, 8, 8, 8, 8}, {8, 8, 8, 8, 8, 8, 8, 8},
                                   {2, 4, 6, 8, 8, 8, 8, 8}, {1, 2, 3, 4, 5, 6, 7, 8}, {2, 4, 6, 8, 8, 8, 8, 8},
                                   {8, 8, 8, 8, 8, 8, 8, 8}, {2, 4, 6, 8, 8, 8, 8, 8}};
  // Per direction: line index = x * (row >> z) + y * (col >> w) + prt_idx_shift.
  const int4 prt_idx[8] = {{1, 1, 0, 0}, {1, 1, 0, 1}, {1, 0, 0, 0}, {1, -1, 0, 1},
                           {1, -1, 0, 0}, {-1, 1, 1, 0}, {0, 1, 0, 0}, {1, 1, 1, 0}};
  const int prt_idx_shift[8] = {0, 0, 0, 3, 7, 3, 0, 0};
  // Per direction: second line sum sharing the weight of line n, or -1 for none.
  const int cost_prt_idx[8][8] = {{14, 13, 12, 11, 10, 9, 8, -1}, {10, 9, 8, -1, -1, -1, -1, -1},
                                  {-1, -1, -1, -1, -1, -1, -1, -1}, {10, 9, 8, -1, -1, -1, -1, -1},
                                  {14, 13, 12, 11, 10, 9, 8, -1}, {10, 9, 8, -1, -1, -1, -1, -1},
                                  {-1, -1, -1, -1, -1, -1, -1, -1}, {10, 9, 8, -1, -1, -1, -1, -1}};

  const int dirn = lz;
  const int4 map = prt_idx[dirn];
  const int bias = prt_idx_shift[dirn];
  int n;
  int row;
  int col;

  // Sum the (shifted, centred) samples along each line of this direction.
  int line_sum[15];
  for (n = 0; n < 15; n++) {
    line_sum[n] = 0;
  }
  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      const int px = (get_loaded_source_sample(x0 + col, y0 + row) >> coeff_shift) - 128;
      line_sum[map.x * (row >> map.z) + map.y * (col >> map.w) + bias] += px;
    }
  }

  // Weighted sum of squared line sums gives this direction's cost.
  int acc = 0;
  for (n = 0; n < 8; n++) {
    const int mirror = cost_prt_idx[dirn][n];
    const int a = line_sum[n];
    int b = 0;
    if (mirror >= 0) {
      b = line_sum[mirror];
    }
    acc += (a * a + b * b) * div_table[div_table_idx[dirn][n]];
  }
  costs[lz][ly][lx] = acc;

  // Wait until all eight directions of this block have been published.
  GroupMemoryBarrierWithGroupSync();

  int dir_cost[8];
  for (n = 0; n < 8; n++) {
    dir_cost[n] = costs[n][ly][lx];
  }

  int top = 0;
  int opposite = 0;
  int2 result = int2(0, 0);
  for (n = 0; n < 8; n++) {
    if (dir_cost[n] > top) {
      top = dir_cost[n];
      opposite = dir_cost[(n + 4) & 7];
      result.x = n;
    }
  }
  result.y = (top - opposite) >> 10;
  return result;
}
// Single-thread direction search for one 8x8 block: evaluates all eight
// directions and returns the one with the highest cost.
//
// x0, y0:      tile-relative position of the block's top-left sample.
// coeff_shift: right shift applied to each sample before 128 is subtracted.
//
// Returns (best direction, (best cost - cost of direction + 4) >> 10).
int2 get_block_dir_old(int x0, int y0, int coeff_shift) {
  // Weights applied to the squared line sums.
  const int div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
  int line_sum[8][15];
  int dir_cost[8];
  int d;
  int n;
  int row;
  int col;

  for (d = 0; d < 8; d++) {
    dir_cost[d] = 0;
    for (n = 0; n < 15; n++) {
      line_sum[d][n] = 0;
    }
  }

  // Accumulate every sample into the line it belongs to, for each direction.
  for (row = 0; row < 8; row++) {
    const int half_row = row >> 1;
    for (col = 0; col < 8; col++) {
      const int half_col = col >> 1;
      const int px = (get_loaded_source_sample(x0 + col, y0 + row) >> coeff_shift) - 128;
      line_sum[0][row + col] += px;
      line_sum[1][row + half_col] += px;
      line_sum[2][row] += px;
      line_sum[3][3 + row - half_col] += px;
      line_sum[4][7 + row - col] += px;
      line_sum[5][3 - half_row + col] += px;
      line_sum[6][col] += px;
      line_sum[7][half_row + col] += px;
    }
  }

  // Directions 2 and 6: eight lines, all weighted equally.
  for (d = 2; d < 8; d += 4) {
    int squares = 0;
    for (n = 0; n < 8; n++) {
      squares += line_sum[d][n] * line_sum[d][n];
    }
    dir_cost[d] = squares * div_table[8];
  }

  // Directions 0 and 4: fifteen lines; line n and line 14 - n share a weight.
  for (d = 0; d < 8; d += 4) {
    int total = 0;
    for (n = 0; n < 7; n++) {
      const int lo = line_sum[d][n];
      const int hi = line_sum[d][14 - n];
      total += (lo * lo + hi * hi) * div_table[n + 1];
    }
    total += line_sum[d][7] * line_sum[d][7] * div_table[8];
    dir_cost[d] = total;
  }

  // Odd directions: five central lines share one weight, three outer pairs
  // have their own.
  for (d = 1; d < 8; d += 2) {
    int centre = 0;
    for (n = 3; n < 8; n++) {
      centre += line_sum[d][n] * line_sum[d][n];
    }
    int total = centre * div_table[8];
    for (n = 0; n < 3; n++) {
      const int lo = line_sum[d][n];
      const int hi = line_sum[d][10 - n];
      total += (lo * lo + hi * hi) * div_table[2 * n + 2];
    }
    dir_cost[d] = total;
  }

  int top = 0;
  int opposite = 0;
  int2 result = int2(0, 0);
  for (d = 0; d < 8; d++) {
    if (dir_cost[d] > top) {
      top = dir_cost[d];
      opposite = dir_cost[(d + 4) & 7];
      result.x = d;
    }
  }
  result.y = (top - opposite) >> 10;
  return result;
}
// NOTE(review): none of the five macros below is referenced by the code
// visible in this file - confirm they are still needed.
// log2 of the value 128 that CDEF_BSTRIDE is built from.
#define MAX_SB_SIZE_LOG2 7
// Rounds `value` up to the next multiple of 2^n.
#define ALIGN_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
#define CDEF_VBORDER (3)
#define CDEF_HBORDER (8)
// (128 + 2 * 8) rounded up to a multiple of 8, i.e. 144.
#define CDEF_BSTRIDE ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
// Returns the index (0..31) of the highest set bit of n, i.e. floor(log2(n)).
// Returns 0 when n is 0.
int get_msb(unsigned int n) {
  int msb = 0;
  unsigned int rest = n;
  // Binary search over the bit positions: try to drop 16, 8, 4, 2, then 1 bits.
  for (int step = 16; step > 0; step >>= 1) {
    if ((rest >> step) != 0) {
      rest >>= step;
      msb += step;
    }
  }
  return msb;
}
// Limits a tap difference before it is added to the filter sum.
//
// diff:      neighbour sample minus centre sample.
// threshold: filter strength; a threshold of 0 always yields 0.
// shift:     damping shift; larger differences reduce the allowed magnitude by
//            |diff| >> shift.
//
// Returns diff with its magnitude capped at max(0, threshold - (|diff| >> shift)),
// keeping the sign of diff.
int constrain(int diff, int threshold, int shift) {
  const int magnitude = abs(diff);
  const int limit = max(0, threshold - (magnitude >> shift));
  return sign(diff) * min(magnitude, limit);
}
// Scales the primary strength by the block's directional variance.
//
// strength: primary strength before adjustment.
// var:      variance term returned by the direction search.
//
// Returns 0 when var is 0; otherwise (strength * (4 + i) + 8) >> 4, where i is
// the highest set bit of var >> 6, capped at 12 (and 0 when var >> 6 is 0).
int adjust_strength(int strength, int var) {
  if (!var) {
    return 0;
  }
  const int coarse = var >> 6;
  int boost = 0;
  if (coarse) {
    boost = min(get_msb(coarse), 12);
  }
  return (strength * (4 + boost) + 8) >> 4;
}
// Compute-shader entry point. One thread group (4 x 4 x 8 threads) filters one
// 32x32 luma tile and then the matching 16x16 tiles of the two chroma planes.
//
// Luma phase:   thread (lx, ly) owns the 8x8 block at (lx * 8, ly * 8) of the
//               tile and lz selects the row inside that block.
// Chroma phase: thread (lx, ly) owns the 4x4 block at (lx * 4, ly * 4),
//               lz >> 2 selects the plane (0 = U, 1 = V) and lz & 3 the row.
//
// The groupshared `input` and `output` arrays are reused between the two
// phases, which is why the barriers below must stay where they are.
[numthreads(WG_WIDTH, WG_HEIGHT, 8)] void main(uint3 Gid
: SV_GroupID, uint3 DTid
: SV_DispatchThreadID, uint3 GTid
: SV_GroupThreadID, uint GI
: SV_GroupIndex) {
// Top-left sample of this group's luma and chroma tiles.
int gx = Gid.x * CDEF_BLOCK_WIDTH;
int gy = Gid.y * CDEF_BLOCK_HEIGHT;
int guvx = Gid.x * CDEF_UV_BLOCK_WIDTH;
int guvy = Gid.y * CDEF_UV_BLOCK_HEIGHT;
int lx = GTid.x;
int ly = GTid.y;
int lz = GTid.z;
int2 dir_var;
int bit_depth = data.bit_depth;
int coeff_shift = bit_depth - 8;
// One 32-bit skip flag per 8x8 luma block; identical for all lz of a block.
int skip = skips.Load(((Gid.y * WG_HEIGHT + ly) * data.skips_stride + Gid.x * WG_WIDTH + lx) * 4);
if (data.hbd) {
load_input_hbd(data.pl, gx, gy, lx, ly, lz);
} else {
load_input(data.pl, gx, gy, lx, ly, lz);
}
// The whole luma tile must be staged before any thread reads it.
GroupMemoryBarrierWithGroupSync();
// Only the lz == 0 thread of each non-skipped block runs the direction search;
// the result reaches the other seven threads through `temp`.
if (lz == 0 && !skip) {
temp[ly][lx] = get_block_dir_old(lx * 8, ly * 8, coeff_shift);
}
// One strength index per 2x2 thread groups.
int index = cdef_index.Load(((Gid.y >> 1) * data.index_stride + (Gid.x >> 1)) * 4);
uint strength = 0;
// Indices outside 0..15 leave the strength at 0.
if (index >= 0 && index < 16) {
strength = data.cdef_strength[index].x;
}
// t = primary strength, s = secondary strength; a secondary value of 3 is
// bumped to 4.
int t = skip ? 0 : strength / 4;
int s = skip ? 0 : strength % 4;
s += s == 3;
// Make the direction results in `temp` visible to all threads.
GroupMemoryBarrierWithGroupSync();
// For skipped blocks `temp` was not written; dir_var is only used when !skip.
dir_var = temp[ly][lx];
const int damping = data.pri_damping + coeff_shift;
if (!skip) {
// Tap weights, selected by the low bit of the unshifted primary strength.
const int pri_taps[2][2] = {{4, 2}, {3, 3}};
// Both rows are identical, so the s_t selection below has no effect.
const int sec_taps[2][2] = {{2, 1}, {2, 1}};
const int x0 = lx * 8;
const int y0 = ly * 8;
const int z0 = lz;
// Luma primary strength is scaled by the variance from the direction search.
const int pri_strength = adjust_strength(t << coeff_shift, dir_var.y);
const int sec_strength = s << coeff_shift;
// The direction only matters when the primary strength is non-zero.
const int dir = t ? dir_var.x : 0;
int i, j, k;
const int p_t = (pri_strength >> coeff_shift) & 1;
const int s_t = (pri_strength >> coeff_shift) & 1;
const int pri_msb = get_msb(pri_strength);
const int sec_msb = get_msb(sec_strength);
const int pri_shift = max(0, damping - pri_msb);
const int sec_shift = max(0, damping - sec_msb);
// The row loop is distributed over the Z threads: this thread handles row z0.
// for (i = 0; i < 8; i++) {
{
i = z0;
for (j = 0; j < 8; j++) {
int sum = 0;
int y;
int x = get_loaded_source_sample(x0 + j, y0 + i);
// Range of the samples taking part in the filter; the result is clamped to it.
int mmax = x;
int mmin = x;
for (k = 0; k < 2; k++) {
if (pri_strength) {
// Primary taps: one sample on each side along the block direction.
int2 dir_pri = data.cdef_directions[dir][k].xy;
int p0 = get_loaded_source_sample(x0 + j + dir_pri.x, y0 + i + dir_pri.y);
int p1 = get_loaded_source_sample(x0 + j - dir_pri.x, y0 + i - dir_pri.y);
sum += pri_taps[p_t][k] * constrain(p0 - x, pri_strength, pri_shift);
sum += pri_taps[p_t][k] * constrain(p1 - x, pri_strength, pri_shift);
// mmax = max(p0 & 0xff, mmax);
// mmax = max(p1 & 0xff, mmax);
// NOTE!: (CDEF_VERY_LARGE & 0xff) = 48, can we adjust CDEF_VERY_LARGE???
// Out-of-plane samples carry the sentinel and must not raise the maximum;
// they cannot lower the minimum, so min() needs no such guard.
if (p0 != CDEF_VERY_LARGE) mmax = max(p0, mmax);
if (p1 != CDEF_VERY_LARGE) mmax = max(p1, mmax);
mmin = min(p0, mmin);
mmin = min(p1, mmin);
}
if (sec_strength) {
// Secondary taps: the directions stored at dir + 2 and dir + 6.
int2 dir1_sec = data.cdef_directions[dir + 2][k].xy;
int2 dir2_sec = data.cdef_directions[dir + 6][k].xy;
int s0 = get_loaded_source_sample(x0 + j + dir1_sec.x, y0 + i + dir1_sec.y);
int s1 = get_loaded_source_sample(x0 + j - dir1_sec.x, y0 + i - dir1_sec.y);
int s2 = get_loaded_source_sample(x0 + j + dir2_sec.x, y0 + i + dir2_sec.y);
int s3 = get_loaded_source_sample(x0 + j - dir2_sec.x, y0 + i - dir2_sec.y);
// mmax = max(s0 & 0xff, mmax);
// mmax = max(s1 & 0xff, mmax);
// mmax = max(s2 & 0xff, mmax);
// mmax = max(s3 & 0xff, mmax);
// NOTE!: (CDEF_VERY_LARGE & 0xff) = 48, can we adjust CDEF_VERY_LARGE???
if (s0 != CDEF_VERY_LARGE) mmax = max(s0, mmax);
if (s1 != CDEF_VERY_LARGE) mmax = max(s1, mmax);
if (s2 != CDEF_VERY_LARGE) mmax = max(s2, mmax);
if (s3 != CDEF_VERY_LARGE) mmax = max(s3, mmax);
mmin = min(s0, mmin);
mmin = min(s1, mmin);
mmin = min(s2, mmin);
mmin = min(s3, mmin);
sum += sec_taps[s_t][k] * constrain(s0 - x, sec_strength, sec_shift);
sum += sec_taps[s_t][k] * constrain(s1 - x, sec_strength, sec_shift);
sum += sec_taps[s_t][k] * constrain(s2 - x, sec_strength, sec_shift);
sum += sec_taps[s_t][k] * constrain(s3 - x, sec_strength, sec_shift);
}
}
// sum / 16 rounded to nearest, symmetrically around zero.
y = clamp(x + ((8 + sum - (sum < 0)) >> 4), mmin, mmax);
output[y0 + i][x0 + j] = y;
}
}
} else {
// Skipped blocks are copied through unchanged.
for (int j = 0; j < 8; j++) output[ly * 8 + lz][lx * 8 + j] = get_loaded_source_sample(lx * 8 + j, ly * 8 + lz);
}
// All filtered luma samples must be in `output` before they are packed.
GroupMemoryBarrierWithGroupSync();
int dy = lz;
{
// Each thread stores its 8-sample row as two groups of four samples.
for (int dx = 0; dx < 2; dx++) {
int4 out_sample;
out_sample.x = output[ly * 8 + dy][lx * 8 + dx * 4 + 0];
out_sample.y = output[ly * 8 + dy][lx * 8 + dx * 4 + 1];
out_sample.z = output[ly * 8 + dy][lx * 8 + dx * 4 + 2];
out_sample.w = output[ly * 8 + dy][lx * 8 + dx * 4 + 3];
if (data.hbd) {
// 16-bit samples: two per 32-bit word, byte offset = sample index * 2.
dst.Store2(data.dst_offset_y + (gy + ly * 8 + dy) * data.pl.x + ((gx + lx * 8 + dx * 4) << 1),
uint2((out_sample.x << 0) | (out_sample.y << 16), (out_sample.z << 0) | (out_sample.w << 16)));
} else {
// 8-bit samples: four per 32-bit word.
dst.Store(data.dst_offset_y + (gy + ly * 8 + dy) * data.pl.x + gx + lx * 8 + dx * 4,
(out_sample.x << 0) | (out_sample.y << 8) | (out_sample.z << 16) | (out_sample.w << 24));
}
}
}
// Chroma processing
// pid 1 = U (lz 0..3), pid 2 = V (lz 4..7); llz is the row inside the 4x4 block.
int pid = 1 + (lz >> 2);
int llz = lz & 3;
// Chroma plane description: own stride and offset, luma width and height halved.
int4 plane = data.pl;
plane.x = data.uv_stride;
plane.y = pid == 1 ? data.uv_offset_u : data.uv_offset_v;
plane.z >>= 1;
plane.w >>= 1;
// `input` is about to be overwritten with chroma samples; wait until every
// thread has finished the luma phase.
GroupMemoryBarrierWithGroupSync();
if (data.hbd) {
load_uv_input_hbd(plane, guvx, guvy, lx, ly, llz, pid);
} else {
load_uv_input(plane, guvx, guvy, lx, ly, llz, pid);
}
// Both chroma tiles must be staged before any thread reads them.
GroupMemoryBarrierWithGroupSync();
// For an out-of-range index `strength` keeps its luma value, which is 0 in
// that case because the luma lookup used the same range check.
if (index >= 0 && index < 16) {
strength = data.cdef_uv_strength[index].x;
}
t = skip ? 0 : strength / 4;
s = skip ? 0 : strength % 4;
s += s == 3;
if (!skip) {
const int pri_taps[2][2] = {{4, 2}, {3, 3}};
const int sec_taps[2][2] = {{2, 1}, {2, 1}};
const int x0 = lx * 4;
const int y0 = ly * 4;
const int z0 = llz;
// Unlike luma, the chroma primary strength is not variance-adjusted.
const int pri_strength = t << coeff_shift;
const int sec_strength = s << coeff_shift;
// Chroma reuses the direction found for the co-located luma block.
const int dir = pri_strength ? dir_var.x : 0;
int i, j, k;
const int p_t = (pri_strength >> coeff_shift) & 1;
const int s_t = (pri_strength >> coeff_shift) & 1;
const int pri_msb = get_msb(pri_strength);
const int sec_msb = get_msb(sec_strength);
// Chroma uses a damping one lower than luma.
const int pri_shift = max(0, damping - 1 - pri_msb);
const int sec_shift = max(0, damping - 1 - sec_msb);
// The row loop is distributed over the Z threads: this thread handles row z0.
// for (i = 0; i < 8; i++) {
{
i = z0;
for (j = 0; j < 4; j++) {
int sum = 0;
int y;
// (pid - 1) * 24 selects this plane's row range inside `input`.
int x = get_loaded_source_sample(x0 + j, y0 + i + (pid - 1) * 24);
int mmax = x;
int mmin = x;
for (k = 0; k < 2; k++) {
if (pri_strength) {
int2 dir_pri = data.cdef_directions[dir][k].xy;
int p0 = get_loaded_source_sample(x0 + j + dir_pri.x, y0 + i + dir_pri.y + (pid - 1) * 24);
int p1 = get_loaded_source_sample(x0 + j - dir_pri.x, y0 + i - dir_pri.y + (pid - 1) * 24);
sum += pri_taps[p_t][k] * constrain(p0 - x, pri_strength, pri_shift);
sum += pri_taps[p_t][k] * constrain(p1 - x, pri_strength, pri_shift);
// mmax = max(p0 & 0xff, mmax);
// mmax = max(p1 & 0xff, mmax);
// NOTE!: (CDEF_VERY_LARGE & 0xff) = 48, can we adjust CDEF_VERY_LARGE???
if (p0 != CDEF_VERY_LARGE) mmax = max(p0, mmax);
if (p1 != CDEF_VERY_LARGE) mmax = max(p1, mmax);
mmin = min(p0, mmin);
mmin = min(p1, mmin);
}
if (sec_strength) {
int2 dir1_sec = data.cdef_directions[dir + 2][k].xy;
int2 dir2_sec = data.cdef_directions[dir + 6][k].xy;
int s0 = get_loaded_source_sample(x0 + j + dir1_sec.x, y0 + i + dir1_sec.y + (pid - 1) * 24);
int s1 = get_loaded_source_sample(x0 + j - dir1_sec.x, y0 + i - dir1_sec.y + (pid - 1) * 24);
int s2 = get_loaded_source_sample(x0 + j + dir2_sec.x, y0 + i + dir2_sec.y + (pid - 1) * 24);
int s3 = get_loaded_source_sample(x0 + j - dir2_sec.x, y0 + i - dir2_sec.y + (pid - 1) * 24);
// mmax = max(s0 & 0xff, mmax);
// mmax = max(s1 & 0xff, mmax);
// mmax = max(s2 & 0xff, mmax);
// mmax = max(s3 & 0xff, mmax);
// NOTE!: (CDEF_VERY_LARGE & 0xff) = 48, can we adjust CDEF_VERY_LARGE???
if (s0 != CDEF_VERY_LARGE) mmax = max(s0, mmax);
if (s1 != CDEF_VERY_LARGE) mmax = max(s1, mmax);
if (s2 != CDEF_VERY_LARGE) mmax = max(s2, mmax);
if (s3 != CDEF_VERY_LARGE) mmax = max(s3, mmax);
mmin = min(s0, mmin);
mmin = min(s1, mmin);
mmin = min(s2, mmin);
mmin = min(s3, mmin);
sum += sec_taps[s_t][k] * constrain(s0 - x, sec_strength, sec_shift);
sum += sec_taps[s_t][k] * constrain(s1 - x, sec_strength, sec_shift);
sum += sec_taps[s_t][k] * constrain(s2 - x, sec_strength, sec_shift);
sum += sec_taps[s_t][k] * constrain(s3 - x, sec_strength, sec_shift);
}
}
// sum / 16 rounded to nearest, symmetrically around zero.
y = clamp(x + ((8 + sum - (sum < 0)) >> 4), mmin, mmax);
// (pid - 1) * 16 selects this plane's row range inside `output`.
output[y0 + i + (pid - 1) * 16][x0 + j] = y;
}
}
} else {
// Skipped blocks are copied through unchanged.
for (int j = 0; j < 4; j++)
output[ly * 4 + llz + (pid - 1) * 16][lx * 4 + j] =
get_loaded_source_sample(lx * 4 + j, ly * 4 + llz + (pid - 1) * 24);
}
// All filtered chroma samples must be in `output` before they are packed.
GroupMemoryBarrierWithGroupSync();
dy = llz;
{
// Each thread stores its 4-sample chroma row in one go.
int4 out_sample;
out_sample.x = output[ly * 4 + dy + (pid - 1) * 16][lx * 4 + 0];
out_sample.y = output[ly * 4 + dy + (pid - 1) * 16][lx * 4 + 1];
out_sample.z = output[ly * 4 + dy + (pid - 1) * 16][lx * 4 + 2];
out_sample.w = output[ly * 4 + dy + (pid - 1) * 16][lx * 4 + 3];
const int dst_offset_uv = pid == 1 ? data.dst_offset_u : data.dst_offset_v;
if (data.hbd) {
dst.Store2(dst_offset_uv + (guvy + ly * 4 + dy) * data.uv_stride + ((guvx + lx * 4) << 1),
uint2((out_sample.x << 0) | (out_sample.y << 16), (out_sample.z << 0) | (out_sample.w << 16)));
} else {
dst.Store(dst_offset_uv + (guvy + ly * 4 + dy) * data.uv_stride + guvx + lx * 4,
(out_sample.x << 0) | (out_sample.y << 8) | (out_sample.z << 16) | (out_sample.w << 24));
}
}
}