| /* |
| * Copyright 2020 Google LLC |
| * |
| */ |
| |
| /* |
| * Copyright (c) 2020, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
// Silences compiler warning 4714 for this shader (presumably the register-pressure
// warning raised for large thread groups - TODO confirm against the compiler docs).
#pragma warning(disable : 4714)
// CDEF preset index table. main() reads one 32-bit entry per 2x2 group of thread
// groups, i.e. one entry per 64x64 luma samples.
ByteAddressBuffer cdef_index : register(t0);
// Skip flags, one 32-bit entry per 8x8 luma block; a non-zero entry makes main() copy
// the block through unfiltered.
ByteAddressBuffer skips : register(t1);
// Frame storage. Source samples are read from it and filtered samples are written back
// to it, at the byte offsets supplied in CDefData.
RWByteAddressBuffer dst : register(u0);
| |
// Per-dispatch parameters for the CDEF pass. The member order defines the constant
// buffer layout and must stay in sync with whatever fills the buffer on the host side.
struct CDefData {
  // Source luma plane as consumed by the load_* helpers and main():
  // x = row stride in bytes, y = byte offset inside `dst`,
  // z = width in samples, w = height in rows.
  int4 pl;

  // Row stride in bytes shared by both chroma planes (source reads and stores).
  int uv_stride;
  // Byte offsets inside `dst` where the filtered Y / U / V samples are stored.
  int dst_offset_y;
  int dst_offset_u;
  int dst_offset_v;

  // Byte offsets inside `dst` of the source U / V planes.
  int uv_offset_u;
  int uv_offset_v;
  // Number of entries per row of the `cdef_index` table.
  int index_stride;
  // Number of entries per row of the `skips` table.
  int skips_stride;

  // Damping used for the primary and secondary taps (main() adds bit_depth - 8).
  int pri_damping;
  // Not read by the code in this file.
  int sec_damping;
  // Not read by the code in this file.
  int pli;
  // Non-zero selects the 16-bit-per-sample load/store paths.
  int hbd;

  // Sample bit depth; main() derives coeff_shift = bit_depth - 8 from it.
  int bit_depth;
  // Unused filler (presumably pads the row to 16 bytes - TODO confirm).
  int3 _dummie;

  // Tap offsets: cdef_directions[d][k].xy is the (dx, dy) step of tap k for direction d.
  // main() indexes it with dir, dir + 2 and dir + 6.
  int4 cdef_directions[16][2];
  // Luma strength per preset in .x: value / 4 is the primary strength, value % 4 the
  // secondary strength.
  int4 cdef_strength[16];
  // Chroma strength per preset in .x, encoded like cdef_strength.
  int4 cdef_uv_strength[16];
};

// Constant buffer instance read by every function in this shader.
cbuffer cb_cdef_data : register(b0) { CDefData data; }
| |
// Luma tile processed by one thread group, in samples.
#define CDEF_BLOCK_WIDTH 32
#define CDEF_BLOCK_HEIGHT 32
// Chroma tile (per plane) processed by one thread group, in samples.
#define CDEF_UV_BLOCK_WIDTH 16
#define CDEF_UV_BLOCK_HEIGHT 16

// Not referenced by the code in this file.
#define CDEF_WIDTH 64
#define CDEF_HEIGHT 64

// Sentinel stored in `input` for samples that lie outside the plane.
#define CDEF_VERY_LARGE (30000)

// Thread-group extent in x and y; the z extent (8) is given directly in numthreads.
#define WG_WIDTH 4
#define WG_HEIGHT 4

// Source samples staged for filtering, one int per sample. Luma uses rows 0..37 and
// columns 0..39 (tile plus a 3-row / 4-column border). In the chroma pass the first
// plane starts at row 0 and the second at row 24.
groupshared int input[CDEF_BLOCK_HEIGHT + 18][CDEF_BLOCK_WIDTH + 8];
// Filtered samples awaiting the packed store. In the chroma pass the first plane uses
// rows 0..15 and the second rows 16..31.
groupshared int output[CDEF_BLOCK_HEIGHT][CDEF_BLOCK_WIDTH];
// Per-direction cost of each 8x8 block; only written and read by get_block_dir().
groupshared int costs[8][WG_HEIGHT][WG_WIDTH];
// Per-8x8-block result of the direction search: x = direction, y = variance term.
groupshared int2 temp[WG_HEIGHT][WG_WIDTH];
| |
// Stages the 8-bit luma tile (32x32 samples plus a 3-row / 4-sample border) in `input`.
//   plane      x = row stride in bytes, y = byte offset of the plane inside `dst`,
//              z = plane width in samples, w = plane height in rows
//   gx, gy     top-left sample of the tile inside the plane
//   llx..llz   thread id inside the group
// Samples are fetched four at a time as one 32-bit word; words outside the plane are
// replaced by CDEF_VERY_LARGE.
// TODO: reorganize load (optimization) - the loop strides are smaller than the spread of
// per-thread start positions, so several threads fetch and store the same words.
void load_input(int4 plane, int gx, int gy, int llx, int lly, int llz) {
  const int words_per_row = CDEF_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_BLOCK_HEIGHT + 6;

  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_word = flat_id % words_per_row;
  const int first_row = flat_id / words_per_row;

  const int plane_words = plane.z >> 2;   // plane width counted in 4-sample words
  const int word_origin = (gx >> 2) - 1;  // word holding the left border
  const int row_origin = gy - 3;          // first border row

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int y = row_origin + row;
    for (int word = first_word; word < words_per_row; word += WG_WIDTH) {
      const int x = word_origin + word;
      int4 quad = (int4)CDEF_VERY_LARGE;
      if (x >= 0 && y >= 0 && x < plane_words && y < plane.w) {
        const uint word32 = dst.Load(plane.y + y * plane.x + x * 4);
        // One byte per sample, least significant byte first.
        quad = int4(uint4(word32, word32 >> 8, word32 >> 16, word32 >> 24) & 255);
      }
      const int col = word * 4;
      input[row][col + 0] = quad.x;
      input[row][col + 1] = quad.y;
      input[row][col + 2] = quad.z;
      input[row][col + 3] = quad.w;
    }
  }
}
// High-bit-depth variant of load_input(): stages the luma tile (32x32 samples plus a
// 3-row / 4-sample border) in `input`, reading 16 bits per sample.
//   plane      x = row stride in bytes, y = byte offset of the plane inside `dst`,
//              z = plane width in samples, w = plane height in rows
//   gx, gy     top-left sample of the tile inside the plane
//   llx..llz   thread id inside the group
// Four samples are fetched per iteration as two 32-bit words; only the low 10 bits of
// each 16-bit lane are kept. Groups outside the plane become CDEF_VERY_LARGE.
void load_input_hbd(int4 plane, int gx, int gy, int llx, int lly, int llz) {
  const int groups_per_row = CDEF_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_BLOCK_HEIGHT + 6;

  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_group = flat_id % groups_per_row;
  const int first_row = flat_id / groups_per_row;

  const int plane_groups = plane.z >> 2;   // plane width counted in 4-sample groups
  const int group_origin = (gx >> 2) - 1;  // group holding the left border
  const int row_origin = gy - 3;           // first border row

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int y = row_origin + row;
    for (int group = first_group; group < groups_per_row; group += WG_WIDTH) {
      const int x = group_origin + group;
      int4 quad = (int4)CDEF_VERY_LARGE;
      if (x >= 0 && y >= 0 && x < plane_groups && y < plane.w) {
        // Eight bytes hold four 16-bit samples.
        const uint2 pair = dst.Load2(plane.y + y * plane.x + (x << 3));
        quad = int4(uint4(pair.x, pair.x >> 16, pair.y, pair.y >> 16) & 0x03ff);
      }
      const int col = group * 4;
      input[row][col + 0] = quad.x;
      input[row][col + 1] = quad.y;
      input[row][col + 2] = quad.z;
      input[row][col + 3] = quad.w;
    }
  }
}
| |
// Stages one 8-bit chroma tile (16x16 samples plus a 3-row / 4-sample border) in `input`.
//   plane      x = row stride in bytes, y = byte offset of the plane inside `dst`,
//              z = plane width in samples, w = plane height in rows
//   gx, gy     top-left sample of the tile inside the plane
//   llx..llz   thread id inside the group of threads that serves this plane
//   pid        plane slot: rows are stored starting at (pid - 1) * 24
// Samples are fetched four at a time as one 32-bit word; words outside the plane are
// replaced by CDEF_VERY_LARGE.
// TODO: reorganize load (optimization) - the loop strides are smaller than the spread of
// per-thread start positions, so several threads fetch and store the same words.
void load_uv_input(int4 plane, int gx, int gy, int llx, int lly, int llz, int pid) {
  const int words_per_row = CDEF_UV_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_UV_BLOCK_HEIGHT + 6;

  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_word = flat_id % words_per_row;
  const int first_row = flat_id / words_per_row;

  const int plane_words = plane.z >> 2;   // plane width counted in 4-sample words
  const int word_origin = (gx >> 2) - 1;  // word holding the left border
  const int row_origin = gy - 3;          // first border row
  const int slot_row = (pid - 1) * 24;    // first `input` row of this plane

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int y = row_origin + row;
    for (int word = first_word; word < words_per_row; word += WG_WIDTH) {
      const int x = word_origin + word;
      int4 quad = (int4)CDEF_VERY_LARGE;
      if (x >= 0 && y >= 0 && x < plane_words && y < plane.w) {
        const uint word32 = dst.Load(plane.y + y * plane.x + x * 4);
        // One byte per sample, least significant byte first.
        quad = int4(uint4(word32, word32 >> 8, word32 >> 16, word32 >> 24) & 255);
      }
      const int col = word * 4;
      input[row + slot_row][col + 0] = quad.x;
      input[row + slot_row][col + 1] = quad.y;
      input[row + slot_row][col + 2] = quad.z;
      input[row + slot_row][col + 3] = quad.w;
    }
  }
}
// High-bit-depth variant of load_uv_input(): stages one chroma tile (16x16 samples plus
// a 3-row / 4-sample border) in `input`, reading 16 bits per sample.
//   plane      x = row stride in bytes, y = byte offset of the plane inside `dst`,
//              z = plane width in samples, w = plane height in rows
//   gx, gy     top-left sample of the tile inside the plane
//   llx..llz   thread id inside the group of threads that serves this plane
//   pid        plane slot: rows are stored starting at (pid - 1) * 24
// Four samples are fetched per iteration as two 32-bit words; only the low 10 bits of
// each 16-bit lane are kept. Groups outside the plane become CDEF_VERY_LARGE.
void load_uv_input_hbd(int4 plane, int gx, int gy, int llx, int lly, int llz, int pid) {
  const int groups_per_row = CDEF_UV_BLOCK_WIDTH / 4 + 2;
  const int rows_total = CDEF_UV_BLOCK_HEIGHT + 6;

  const uint flat_id = ((llz * 4 + lly) * 4 + llx);
  const int first_group = flat_id % groups_per_row;
  const int first_row = flat_id / groups_per_row;

  const int plane_groups = plane.z >> 2;   // plane width counted in 4-sample groups
  const int group_origin = (gx >> 2) - 1;  // group holding the left border
  const int row_origin = gy - 3;           // first border row
  const int slot_row = (pid - 1) * 24;     // first `input` row of this plane

  for (int row = first_row; row < rows_total; row += WG_HEIGHT) {
    const int y = row_origin + row;
    for (int group = first_group; group < groups_per_row; group += WG_WIDTH) {
      const int x = group_origin + group;
      int4 quad = (int4)CDEF_VERY_LARGE;
      if (x >= 0 && y >= 0 && x < plane_groups && y < plane.w) {
        // Eight bytes hold four 16-bit samples.
        const uint2 pair = dst.Load2(plane.y + y * plane.x + (x << 3));
        quad = int4(uint4(pair.x, pair.x >> 16, pair.y, pair.y >> 16) & 0x03ff);
      }
      const int col = group * 4;
      input[row + slot_row][col + 0] = quad.x;
      input[row + slot_row][col + 1] = quad.y;
      input[row + slot_row][col + 2] = quad.z;
      input[row + slot_row][col + 3] = quad.w;
    }
  }
}
| |
// Reads a staged sample by tile-relative coordinates. (0, 0) is the first sample of the
// tile; the staged data starts 3 rows above and 4 columns to the left of it, so small
// negative coordinates address the border.
int get_loaded_source_sample(int x, int y) {
  const int row = y + 3;
  const int col = x + 4;
  return input[row][col];
}
| |
// Direction search for one 8x8 block, split across the z dimension of the thread group:
// the thread with z index `lz` computes the cost of direction `lz`, publishes it in
// `costs`, and after a group barrier every thread picks the best direction.
//   x0, y0       tile-relative position of the block's first sample
//   lx, ly, lz   thread id inside the group (lz doubles as the direction index)
//   coeff_shift  right shift applied to samples before the search
// Returns x = direction with the largest cost (0 when no cost is positive) and
// y = (best cost - cost of the direction 4 steps away) >> 10.
// Contains a group barrier, so every thread of the group has to reach the call.
int2 get_block_dir(int x0, int y0, int lx, int ly, int lz, int coeff_shift) {
  // Weights applied to the squared line sums, indexed through div_table_idx.
  const int div_table[9] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
  const int div_table_idx[8][8] = {{1, 2, 3, 4, 5, 6, 7, 8}, {2, 4, 6, 8, 8, 8, 8, 8},
                                   {8, 8, 8, 8, 8, 8, 8, 8}, {2, 4, 6, 8, 8, 8, 8, 8},
                                   {1, 2, 3, 4, 5, 6, 7, 8}, {2, 4, 6, 8, 8, 8, 8, 8},
                                   {8, 8, 8, 8, 8, 8, 8, 8}, {2, 4, 6, 8, 8, 8, 8, 8}};
  // Maps (row, col) to a line index per direction:
  // line = x * (row >> z) + y * (col >> w) + line_bias[direction].
  const int4 line_map[8] = {{1, 1, 0, 0},  {1, 1, 0, 1},  {1, 0, 0, 0}, {1, -1, 0, 1},
                            {1, -1, 0, 0}, {-1, 1, 1, 0}, {0, 1, 0, 0}, {1, 1, 1, 0}};
  const int line_bias[8] = {0, 0, 0, 3, 7, 3, 0, 0};
  // Line that shares its weight with line n, or -1 when line n stands alone.
  const int paired_line[8][8] = {{14, 13, 12, 11, 10, 9, 8, -1}, {10, 9, 8, -1, -1, -1, -1, -1},
                                 {-1, -1, -1, -1, -1, -1, -1, -1}, {10, 9, 8, -1, -1, -1, -1, -1},
                                 {14, 13, 12, 11, 10, 9, 8, -1}, {10, 9, 8, -1, -1, -1, -1, -1},
                                 {-1, -1, -1, -1, -1, -1, -1, -1}, {10, 9, 8, -1, -1, -1, -1, -1}};

  const int my_dir = lz;
  const int4 mapping = line_map[my_dir];
  const int bias = line_bias[my_dir];
  int row;
  int col;
  int n;

  // Sum the centred samples along every line of this thread's direction.
  int line_sum[15] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      const int centred = (get_loaded_source_sample(x0 + col, y0 + row) >> coeff_shift) - 128;
      line_sum[mapping.x * (row >> mapping.z) + mapping.y * (col >> mapping.w) + bias] += centred;
    }
  }

  // Weighted sum of squared line sums; accumulated locally and published once.
  int my_cost = 0;
  for (n = 0; n < 8; n++) {
    const int partner = paired_line[my_dir][n];
    const int a = line_sum[n];
    int b = 0;
    if (partner >= 0) {
      b = line_sum[partner];
    }
    my_cost += (a * a + b * b) * div_table[div_table_idx[my_dir][n]];
  }
  costs[lz][ly][lx] = my_cost;
  GroupMemoryBarrierWithGroupSync();

  // Gather the eight costs of this block and pick the largest.
  int block_cost[8];
  for (n = 0; n < 8; n++) {
    block_cost[n] = costs[n][ly][lx];
  }
  int top_cost = 0;
  int opposite_cost = 0;
  int2 result = int2(0, 0);
  for (n = 0; n < 8; n++) {
    if (block_cost[n] > top_cost) {
      top_cost = block_cost[n];
      opposite_cost = block_cost[(n + 4) & 7];
      result.x = n;
    }
  }
  result.y = (top_cost - opposite_cost) >> 10;
  return result;
}
| |
// Single-thread direction search for one 8x8 block of the staged tile.
//   x0, y0       tile-relative position of the block's first sample
//   coeff_shift  right shift applied to samples before the search
// For each of the 8 directions the centred samples are summed along the lines of that
// direction and the squared sums are combined into a cost.
// Returns x = direction with the largest cost (0 when no cost is positive) and
// y = (best cost - cost of the direction 4 steps away) >> 10.
int2 get_block_dir_old(int x0, int y0, int coeff_shift) {
  // Weights applied to the squared line sums.
  const int div_table[9] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
  int d;
  int n;
  int row;
  int col;

  int line_sum[8][15];
  for (d = 0; d < 8; d++) {
    for (n = 0; n < 15; n++) {
      line_sum[d][n] = 0;
    }
  }

  for (row = 0; row < 8; row++) {
    const int half_row = row >> 1;
    for (col = 0; col < 8; col++) {
      const int half_col = col >> 1;
      const int centred = (get_loaded_source_sample(x0 + col, y0 + row) >> coeff_shift) - 128;
      line_sum[0][row + col] += centred;
      line_sum[1][row + half_col] += centred;
      line_sum[2][row] += centred;
      line_sum[3][3 + row - half_col] += centred;
      line_sum[4][7 + row - col] += centred;
      line_sum[5][3 - half_row + col] += centred;
      line_sum[6][col] += centred;
      line_sum[7][half_row + col] += centred;
    }
  }

  int cost[8];

  // Directions 2 and 6: eight lines, all with the same weight.
  for (d = 2; d < 8; d += 4) {
    int acc = 0;
    for (n = 0; n < 8; n++) {
      acc += line_sum[d][n] * line_sum[d][n];
    }
    cost[d] = acc * div_table[8];
  }

  // Directions 0 and 4: fifteen lines, weighted in mirrored pairs around line 7.
  for (d = 0; d < 8; d += 4) {
    int acc = line_sum[d][7] * line_sum[d][7] * div_table[8];
    for (n = 0; n < 7; n++) {
      const int near_sum = line_sum[d][n];
      const int far_sum = line_sum[d][14 - n];
      acc += (near_sum * near_sum + far_sum * far_sum) * div_table[n + 1];
    }
    cost[d] = acc;
  }

  // Odd directions: five centre lines with a common weight plus three mirrored pairs.
  for (d = 1; d < 8; d += 2) {
    int acc = 0;
    for (n = 3; n < 8; n++) {
      acc += line_sum[d][n] * line_sum[d][n];
    }
    acc *= div_table[8];
    for (n = 0; n < 3; n++) {
      const int near_sum = line_sum[d][n];
      const int far_sum = line_sum[d][10 - n];
      acc += (near_sum * near_sum + far_sum * far_sum) * div_table[2 * n + 2];
    }
    cost[d] = acc;
  }

  int top_cost = 0;
  int opposite_cost = 0;
  int2 result = int2(0, 0);
  for (d = 0; d < 8; d++) {
    if (cost[d] > top_cost) {
      top_cost = cost[d];
      opposite_cost = cost[(d + 4) & 7];
      result.x = d;
    }
  }
  result.y = (top_cost - opposite_cost) >> 10;
  return result;
}
| |
// Buffer-geometry helpers. None of them is referenced by the code visible in this file
// (presumably carried over from a CPU implementation - TODO confirm before removing).
// log2 of the largest block edge, i.e. 128 samples.
#define MAX_SB_SIZE_LOG2 7
// Rounds `value` up to the next multiple of 2^n.
#define ALIGN_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
// Border sizes in rows / columns.
#define CDEF_VBORDER (3)
#define CDEF_HBORDER (8)
// 128 + 2 * 8 rounded up to a multiple of 8, i.e. 144.
#define CDEF_BSTRIDE ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
| |
// Returns the zero-based position of the highest set bit of `n`, found by a binary
// search over shift widths 16, 8, 4, 2, 1. Returns 0 for n == 0 as well as for n == 1.
int get_msb(unsigned int n) {
  int msb = 0;
  for (int width = 16; width > 0; width >>= 1) {
    // If anything remains above the lower `width` bits, continue in the upper part.
    if ((n >> width) != 0) {
      n >>= width;
      msb += width;
    }
  }
  return msb;
}
| |
// Limits a tap difference: the result keeps the sign of `diff` and has magnitude
// min(|diff|, max(0, threshold - (|diff| >> shift))). Large differences therefore
// contribute little or nothing, and a zero threshold always yields 0.
int constrain(int diff, int threshold, int shift) {
  const int magnitude = abs(diff);
  const int allowance = max(0, threshold - (magnitude >> shift));
  return sign(diff) * min(magnitude, allowance);
}
| |
// Scales the primary strength by the block's variance term:
// result = (strength * (4 + i) + 8) >> 4, with i = min(msb(var >> 6), 12), or i = 0 when
// var >> 6 is zero. A zero `var` always yields 0.
int adjust_strength(int strength, int var) {
  const int coarse_var = var >> 6;
  int boost = 0;
  if (coarse_var != 0) {
    boost = min(get_msb(coarse_var), 12);
  }
  const int scaled = (strength * (4 + boost) + 8) >> 4;
  return var != 0 ? scaled : 0;
}
| |
// Splits a packed strength value into x = primary strength (value / 4) and
// y = secondary strength (value % 4, with 3 mapped to 4). Both are 0 for skipped blocks.
int2 cdef_split_strength(uint strength, int skip) {
  int pri = skip ? 0 : strength / 4;
  int sec = skip ? 0 : strength % 4;
  sec += sec == 3;
  return int2(pri, sec);
}

// Filters the staged sample at tile-relative position (sx, sy) and returns the result.
//   dir                     direction used to look up tap offsets in data.cdef_directions
//   pri_/sec_strength       tap thresholds; a zero strength disables that tap group
//   pri_/sec_shift          shifts handed to constrain() for the two tap groups
//   tap_set                 row of the tap-weight tables to use (0 or 1)
// The result is clamped to the range spanned by the centre sample and all taps read;
// taps equal to CDEF_VERY_LARGE (outside the plane) do not raise the upper bound.
int cdef_filter_sample(int sx, int sy, int dir, int pri_strength, int sec_strength, int pri_shift,
                       int sec_shift, int tap_set) {
  const int pri_taps[2][2] = {{4, 2}, {3, 3}};
  const int sec_taps[2][2] = {{2, 1}, {2, 1}};

  const int centre = get_loaded_source_sample(sx, sy);
  int sum = 0;
  int hi = centre;
  int lo = centre;
  int k;
  for (k = 0; k < 2; k++) {
    if (pri_strength) {
      const int2 ofs = data.cdef_directions[dir][k].xy;
      const int p0 = get_loaded_source_sample(sx + ofs.x, sy + ofs.y);
      const int p1 = get_loaded_source_sample(sx - ofs.x, sy - ofs.y);
      sum += pri_taps[tap_set][k] * constrain(p0 - centre, pri_strength, pri_shift);
      sum += pri_taps[tap_set][k] * constrain(p1 - centre, pri_strength, pri_shift);
      if (p0 != CDEF_VERY_LARGE) hi = max(p0, hi);
      if (p1 != CDEF_VERY_LARGE) hi = max(p1, hi);
      lo = min(p0, lo);
      lo = min(p1, lo);
    }
    if (sec_strength) {
      const int2 ofs_a = data.cdef_directions[dir + 2][k].xy;
      const int2 ofs_b = data.cdef_directions[dir + 6][k].xy;
      const int s0 = get_loaded_source_sample(sx + ofs_a.x, sy + ofs_a.y);
      const int s1 = get_loaded_source_sample(sx - ofs_a.x, sy - ofs_a.y);
      const int s2 = get_loaded_source_sample(sx + ofs_b.x, sy + ofs_b.y);
      const int s3 = get_loaded_source_sample(sx - ofs_b.x, sy - ofs_b.y);
      if (s0 != CDEF_VERY_LARGE) hi = max(s0, hi);
      if (s1 != CDEF_VERY_LARGE) hi = max(s1, hi);
      if (s2 != CDEF_VERY_LARGE) hi = max(s2, hi);
      if (s3 != CDEF_VERY_LARGE) hi = max(s3, hi);
      lo = min(s0, lo);
      lo = min(s1, lo);
      lo = min(s2, lo);
      lo = min(s3, lo);
      sum += sec_taps[tap_set][k] * constrain(s0 - centre, sec_strength, sec_shift);
      sum += sec_taps[tap_set][k] * constrain(s1 - centre, sec_strength, sec_shift);
      sum += sec_taps[tap_set][k] * constrain(s2 - centre, sec_strength, sec_shift);
      sum += sec_taps[tap_set][k] * constrain(s3 - centre, sec_strength, sec_shift);
    }
  }
  // Divide by 16 with rounding; the (sum < 0) term makes negative sums round symmetrically.
  return clamp(centre + ((8 + sum - (sum < 0)) >> 4), lo, hi);
}

// Packs four horizontally adjacent samples and stores them in `dst`.
//   plane_offset  byte offset of the destination plane
//   stride        row stride of the destination plane in bytes
//   col, row      position of the first sample inside the plane
// Uses 16 bits per sample when data.hbd is set and 8 bits per sample otherwise.
void cdef_store_quad(int plane_offset, int stride, int col, int row, int4 q) {
  const int row_base = plane_offset + row * stride;
  if (data.hbd) {
    dst.Store2(row_base + (col << 1), uint2((q.x << 0) | (q.y << 16), (q.z << 0) | (q.w << 16)));
  } else {
    dst.Store(row_base + col, (q.x << 0) | (q.y << 8) | (q.z << 16) | (q.w << 24));
  }
}

// CDEF entry point. One thread group filters a 32x32 luma tile and the matching 16x16
// tile of each chroma plane (chroma plane dimensions are taken as half the luma ones).
//
// Thread roles inside the 4x4x8 group:
//   luma    (lx, ly) selects an 8x8 block, lz one row of it; a thread filters 8 samples.
//   chroma  lz >> 2 selects the plane, (lx, ly) a 4x4 block, lz & 3 one row of it;
//           a thread filters 4 samples.
// The direction found for a luma 8x8 block is reused for the chroma block at the same
// (lx, ly). Blocks flagged in `skips` are copied through unchanged.
[numthreads(WG_WIDTH, WG_HEIGHT, 8)] void main(uint3 Gid : SV_GroupID,
                                                uint3 DTid : SV_DispatchThreadID,
                                                uint3 GTid : SV_GroupThreadID,
                                                uint GI : SV_GroupIndex) {
  const int gx = Gid.x * CDEF_BLOCK_WIDTH;
  const int gy = Gid.y * CDEF_BLOCK_HEIGHT;
  const int guvx = Gid.x * CDEF_UV_BLOCK_WIDTH;
  const int guvy = Gid.y * CDEF_UV_BLOCK_HEIGHT;
  const int lx = GTid.x;
  const int ly = GTid.y;
  const int lz = GTid.z;
  const int coeff_shift = data.bit_depth - 8;
  int j;

  // ---- Luma ----
  const int skip = skips.Load(((Gid.y * WG_HEIGHT + ly) * data.skips_stride + Gid.x * WG_WIDTH + lx) * 4);
  if (data.hbd) {
    load_input_hbd(data.pl, gx, gy, lx, ly, lz);
  } else {
    load_input(data.pl, gx, gy, lx, ly, lz);
  }
  GroupMemoryBarrierWithGroupSync();

  // One thread per 8x8 block runs the direction search and shares the result via `temp`.
  if (lz == 0 && !skip) {
    temp[ly][lx] = get_block_dir_old(lx * 8, ly * 8, coeff_shift);
  }

  // Presets are stored per 2x2 group of thread groups; an out-of-range index leaves the
  // strength at 0, which disables both tap groups.
  const int index = cdef_index.Load(((Gid.y >> 1) * data.index_stride + (Gid.x >> 1)) * 4);
  uint strength = 0;
  if (index >= 0 && index < 16) {
    strength = data.cdef_strength[index].x;
  }
  const int2 luma_level = cdef_split_strength(strength, skip);

  GroupMemoryBarrierWithGroupSync();
  // Only meaningful for non-skipped blocks; skipped blocks never use it.
  const int2 dir_var = temp[ly][lx];

  const int damping = data.pri_damping + coeff_shift;
  const int luma_col = lx * 8;
  const int luma_row = ly * 8 + lz;
  if (!skip) {
    const int pri_strength = adjust_strength(luma_level.x << coeff_shift, dir_var.y);
    const int sec_strength = luma_level.y << coeff_shift;
    const int dir = luma_level.x ? dir_var.x : 0;
    const int tap_set = (pri_strength >> coeff_shift) & 1;
    const int pri_shift = max(0, damping - get_msb(pri_strength));
    const int sec_shift = max(0, damping - get_msb(sec_strength));
    for (j = 0; j < 8; j++) {
      output[luma_row][luma_col + j] = cdef_filter_sample(luma_col + j, luma_row, dir, pri_strength, sec_strength,
                                                          pri_shift, sec_shift, tap_set);
    }
  } else {
    for (j = 0; j < 8; j++) {
      output[luma_row][luma_col + j] = get_loaded_source_sample(luma_col + j, luma_row);
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // Each thread stores its row of 8 samples as two groups of four.
  int half_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    const int col = luma_col + half_idx * 4;
    const int4 q = int4(output[luma_row][col + 0], output[luma_row][col + 1], output[luma_row][col + 2],
                        output[luma_row][col + 3]);
    cdef_store_quad(data.dst_offset_y, data.pl.x, gx + col, gy + luma_row, q);
  }

  // ---- Chroma ----
  const int pid = 1 + (lz >> 2);  // 1 = first chroma plane, 2 = second
  const int llz = lz & 3;         // row inside the 4x4 chroma block
  int4 plane = data.pl;
  plane.x = data.uv_stride;
  plane.y = pid == 1 ? data.uv_offset_u : data.uv_offset_v;
  plane.z >>= 1;
  plane.w >>= 1;
  GroupMemoryBarrierWithGroupSync();
  if (data.hbd) {
    load_uv_input_hbd(plane, guvx, guvy, lx, ly, llz, pid);
  } else {
    load_uv_input(plane, guvx, guvy, lx, ly, llz, pid);
  }
  GroupMemoryBarrierWithGroupSync();

  if (index >= 0 && index < 16) {
    strength = data.cdef_uv_strength[index].x;
  }
  const int2 chroma_level = cdef_split_strength(strength, skip);

  // The two planes are stacked vertically: 24 rows apart in `input`, 16 in `output`.
  const int chroma_col = lx * 4;
  const int chroma_row = ly * 4 + llz;
  const int in_row = chroma_row + (pid - 1) * 24;
  const int out_row = chroma_row + (pid - 1) * 16;
  if (!skip) {
    const int pri_strength = chroma_level.x << coeff_shift;
    const int sec_strength = chroma_level.y << coeff_shift;
    const int dir = pri_strength ? dir_var.x : 0;
    const int tap_set = (pri_strength >> coeff_shift) & 1;
    // Chroma uses a damping one lower than luma.
    const int pri_shift = max(0, damping - 1 - get_msb(pri_strength));
    const int sec_shift = max(0, damping - 1 - get_msb(sec_strength));
    for (j = 0; j < 4; j++) {
      output[out_row][chroma_col + j] = cdef_filter_sample(chroma_col + j, in_row, dir, pri_strength, sec_strength,
                                                           pri_shift, sec_shift, tap_set);
    }
  } else {
    for (j = 0; j < 4; j++) {
      output[out_row][chroma_col + j] = get_loaded_source_sample(chroma_col + j, in_row);
    }
  }
  GroupMemoryBarrierWithGroupSync();

  const int4 uv_quad = int4(output[out_row][chroma_col + 0], output[out_row][chroma_col + 1],
                            output[out_row][chroma_col + 2], output[out_row][chroma_col + 3]);
  const int dst_offset_uv = pid == 1 ? data.dst_offset_u : data.dst_offset_v;
  cdef_store_quad(dst_offset_uv, data.uv_stride, guvx + chroma_col, guvy + chroma_row, uv_quad);
}