| /* |
| * Copyright 2020 Google LLC |
| * |
| */ |
| |
| /* |
| * Copyright (c) 2020, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include "film_grain_const.h" |
| |
// Resource bindings used by the film grain application pass.
//   src           - input frame, addressed in bytes (plane offset + row * stride).
//   grain_block   - pre-generated grain samples: luma first, Cb and Cr at the
//                   element offsets given by grain_offset_u / grain_offset_v.
//   random_offset - one packed value per block; bits 0..3 are the vertical and
//                   bits 4..7 the horizontal offset into the grain samples.
//   dst           - output frame, addressed in bytes like src.
ByteAddressBuffer     src           : register(t0);
StructuredBuffer<int> grain_block   : register(t1);
StructuredBuffer<int> random_offset : register(t2);
RWByteAddressBuffer   dst           : register(u0);
| |
// Per-dispatch constants of the film grain pass.
// Member order and types define the constant-buffer layout and must stay in
// sync with whatever fills this buffer on the host side.
struct FilmGrainData {
  GrainParams params;  // grain parameters (type declared in film_grain_const.h)

  // Per plane (0 = luma, 1 = Cb, 2 = Cr): .x is the row stride in bytes and
  // .y the byte offset of the plane; .z/.w are not read by this shader.
  int4 src_planes[3];
  int4 dst_planes[3];

  int enable_chroma;         // non-zero: fetch, blend and apply chroma grain
  int random_offset_stride;  // elements per block row in random_offset
  int width;                 // luma width limit used to suppress stores
  int height;                // luma height limit used to suppress stores

  int mc_identity;          // non-zero: clip chroma to the luma legal range
  int luma_grain_stride;    // elements per row of the luma grain samples
  int chroma_grain_stride;  // elements per row of the chroma grain samples
  int left_pad;             // added to every horizontal grain offset

  int right_pad;   // not read by this shader
  int top_pad;     // added to every vertical grain offset
  int bottom_pad;  // not read by this shader
  int ar_padding;  // luma grain offsets start at 2 * ar_padding

  int grain_offset_u;  // element offset of the Cb grain inside grain_block
  int grain_offset_v;  // element offset of the Cr grain inside grain_block
  int is_10x3;         // non-zero: output packs three 10-bit samples per 32-bit word
  int pad;             // not read by this shader

  // Scaling function table: .x = luma, .y = Cb, .z = Cr (.w is not read).
  int4 scaling_lut[256];
};

cbuffer cb_film_grain_data : register(b0) {
  FilmGrainData data;
};
| |
// Weighted blend of two grain samples for the block-overlap step, rounded
// (+16, >> 5) and clamped to [grain_min, grain_max]. grain_min / grain_max and
// clamp() must be visible at the point of expansion.
//   clamp_ln0: weights 23 / 22
//   clamp_ln1: weights 27 / 17
//   clamp_ln2: weights 17 / 27
// Every parameter is parenthesised so that compound arguments such as
// `a + 1` or `i & 3` are evaluated as a unit.
#define clamp_ln0(a, b) clamp(((a) * 23 + (b) * 22 + 16) >> 5, grain_min, grain_max)
#define clamp_ln1(a, b) clamp(((a) * 27 + (b) * 17 + 16) >> 5, grain_min, grain_max)
#define clamp_ln2(a, b) clamp(((a) * 17 + (b) * 27 + 16) >> 5, grain_min, grain_max)

// Selects clamp_ln1 when n == 1 and clamp_ln2 for any other n.
#define clamp_ln(n, a, b) (((n) == 1) ? clamp_ln1(a, b) : clamp_ln2(a, b))
| |
// Group-shared scratch memory. The first index is the thread's GTid.z, i.e.
// which of the three blocks processed by one thread group the data belongs to.
groupshared int luma_grain_temp[3][32 * 32];  // luma grain, later reused for 10x3 output samples
groupshared int avarage_luma[3][32 * 32];     // luma values co-located with the chroma samples
groupshared int cr_grain_temp[3][16 * 16];    // Cr grain, later reused for 10x3 output samples
groupshared int cb_grain_temp[3][16 * 16];    // Cb grain, later reused for 10x3 output samples

// Chroma subsampling shifts: both 1, so chroma blocks are half the luma
// block size in each direction.
#define chroma_subsamp_x 1
#define chroma_subsamp_y 1
| |
// Scaling-function lookup with linear interpolation between table entries,
// for sample values wider than 8 bits. With s = bit_depth - 8 and
// e = index >> s the result is
//
//   lut[e] + (((lut[e + 1] - lut[e]) * (index & ((1 << s) - 1)) + (1 << (s - 1))) >> s)
//
// except for the last entry (e == 255), which is returned without
// interpolation. `data` and `bit_depth` must be visible at the point of
// expansion. The rounding term shifts by (bit_depth - 9), so these macros must
// only be used when bit_depth > 8; 8-bit content indexes the table directly.
//
// `index` is parenthesised everywhere so that compound arguments are handled
// correctly; it is evaluated several times, so it must have no side effects.
// `ch` names the int4 component of the table entry to read.
#define scale_LUT_channel(index, ch)                                   \
  (data.scaling_lut[(index) >> (bit_depth - 8)].ch +                   \
   ((((index) >> (bit_depth - 8)) == 255)                              \
        ? 0                                                            \
        : ((((data.scaling_lut[((index) >> (bit_depth - 8)) + 1].ch -  \
              data.scaling_lut[(index) >> (bit_depth - 8)].ch) *       \
             ((index) & ((1 << (bit_depth - 8)) - 1))) +               \
            (1 << (bit_depth - 9))) >>                                 \
           (bit_depth - 8))))

// Luma scaling (component .x of the table).
#define scale_LUT(index) scale_LUT_channel(index, x)

// Cb scaling (component .y of the table).
#define scale_LUT_cb(index) scale_LUT_channel(index, y)

// Cr scaling (component .z of the table).
#define scale_LUT_cr(index) scale_LUT_channel(index, z)
| |
// Splits one 32-bit word into four 8-bit samples, lowest byte first.
int4 fg_unpack_8bit(uint w) {
  return int4((w >> 0) & 255, (w >> 8) & 255, (w >> 16) & 255, (w >> 24) & 255);
}

// Splits two 32-bit words into four samples stored in 16-bit lanes, keeping
// the low 10 bits of each lane.
int4 fg_unpack_16bit(uint2 w) {
  return int4((w.x >> 0) & 0x03ff, (w.x >> 16) & 0x03ff, (w.y >> 0) & 0x03ff, (w.y >> 16) & 0x03ff);
}

// Packs four 8-bit samples into one 32-bit word, lowest byte first.
uint fg_pack_8bit(int4 v) {
  return v.x | (v.y << 8) | (v.z << 16) | (v.w << 24);
}

// Packs four samples into 16-bit lanes of two 32-bit words.
uint2 fg_pack_16bit(int4 v) {
  return uint2((v.x << 0) | (v.y << 16), (v.z << 0) | (v.w << 16));
}

// Packs three 10-bit samples into one 32-bit word (x in the lowest bits).
uint fg_pack_10x3(uint3 v) {
  return ((v.z & 0x3ff) << 20) | ((v.y & 0x3ff) << 10) | (v.x & 0x3ff);
}

// Loads four horizontally adjacent samples from src. row_base is the byte
// address of the row start and sample_x the index of the first sample;
// samples take one byte at bit_depth 8 and two bytes otherwise.
int4 fg_load_samples(int row_base, int sample_x, int bit_depth) {
  int4 samples;
  if (bit_depth == 8) {
    samples = fg_unpack_8bit(src.Load(row_base + sample_x));
  } else {
    samples = fg_unpack_16bit(src.Load2(row_base + sample_x * 2));
  }
  return samples;
}

// Stores four horizontally adjacent samples to dst; addressing mirrors
// fg_load_samples.
void fg_store_samples(int row_base, int sample_x, int bit_depth, int4 samples) {
  if (bit_depth == 8) {
    dst.Store(row_base + sample_x, fg_pack_8bit(samples));
  } else {
    dst.Store2(row_base + sample_x * 2, fg_pack_16bit(samples));
  }
}

// Film grain application pass.
//
// One thread group handles three horizontally adjacent luma blocks
// (GTid.z selects the block); within a block GTid.y is the row and GTid.x
// selects a run of four luma samples. The stages, separated by group barriers:
//   1. copy the block's luma / chroma grain from grain_block into group-shared
//      memory, using the block's entry in random_offset,
//   2. blend the left edge with the left neighbour's grain (overlap_flag),
//   3. blend the top edge with the upper neighbour's grain (overlap_flag),
//   4. scale the grain, add it to the luma samples, clip and write the result,
//   5. derive the luma value co-located with each chroma sample,
//   6. scale the grain, add it to the Cb / Cr samples, clip and write.
// When data.is_10x3 is set the results are staged in group-shared memory and
// written as 32-bit words holding three 10-bit samples each.
[numthreads(luma_subblock_size_x / 4, luma_subblock_size_y, 3)]
void main(int3 Gid : SV_GroupID, int3 GTid : SV_GroupThreadID) {
  const int overlap = data.params.overlap_flag;
  const int bit_depth = data.params.bit_depth;
  const int lid = GTid.z;          // which of the group's three blocks
  const int ii = Gid.y;            // block row
  const int jj = Gid.x * 3 + lid;  // block column
  const int row = GTid.y;          // row inside the block
  const int grain_center = 128 << (bit_depth - 8);
  const int grain_min = 0 - grain_center;
  const int grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
  const int enable_chroma = data.enable_chroma;

  // Top-left luma sample of the block.
  const int y = ii * luma_subblock_size_y;
  const int x = jj * luma_subblock_size_x;

  const int true_chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
  const int true_chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;

  // Start index and step used when a row of packed 10x3 words is distributed
  // over the (GTid.x, GTid.z) threads of that row.
  const int first_j3 = GTid.z * (luma_subblock_size_x / 4) + GTid.x;
  const int j3_step = 3 * (luma_subblock_size_x / 4);

  // ---- Grain offsets of this block and of its up / left / up-left neighbours.
  // The inner (y ? 1 : 0) / (x ? 1 : 0) terms keep the index non-negative even
  // when the guarded read is evaluated for a block on the first row / column.
  const int random_offset_stride = data.random_offset_stride;
  int offset_y_up = y ? random_offset[(ii - (y ? 1 : 0)) * random_offset_stride + jj] : 0;
  int offset_y_up_left =
      (y > 0) && (x > 0) ? random_offset[(ii - (y ? 1 : 0)) * random_offset_stride + jj - (x ? 1 : 0)] : 0;
  int offset_y_left = x ? random_offset[ii * random_offset_stride + jj - (x ? 1 : 0)] : 0;
  int offset_y = random_offset[ii * random_offset_stride + jj];

  // Bits 4..7 carry the horizontal offset, bits 0..3 the vertical one.
  const int offset_x_up = (offset_y_up >> 4) & 15;
  const int offset_x_up_left = (offset_y_up_left >> 4) & 15;
  const int offset_x_left = (offset_y_left >> 4) & 15;
  const int offset_x = (offset_y >> 4) & 15;
  offset_y_up &= 15;
  offset_y_up_left &= 15;
  offset_y_left &= 15;
  offset_y &= 15;

  // Neighbour offsets are advanced by 32 in the direction of this block so
  // that they address the neighbour's samples lying under this block's edge.
  const int ar_padding = data.ar_padding;
  int luma_offset_y = 2 * ar_padding + (offset_y << 1);
  int luma_offset_x = 2 * ar_padding + (offset_x << 1);
  int luma_offset_y_left = 2 * ar_padding + (offset_y_left << 1);
  int luma_offset_x_left = 2 * ar_padding + (offset_x_left << 1) + 32;
  int luma_offset_y_up = 2 * ar_padding + (offset_y_up << 1) + 32;
  int luma_offset_x_up = 2 * ar_padding + (offset_x_up << 1);
  int luma_offset_y_up_left = 2 * ar_padding + (offset_y_up_left << 1) + 32;
  int luma_offset_x_up_left = 2 * ar_padding + (offset_x_up_left << 1) + 32;

  // Chroma offsets are derived from the luma offsets before padding is added.
  const int top_pad = data.top_pad;
  const int left_pad = data.left_pad;
  const int chroma_offset_y = top_pad + (luma_offset_y >> chroma_subsamp_y);
  const int chroma_offset_x = left_pad + (luma_offset_x >> chroma_subsamp_x);
  const int chroma_offset_y_left = top_pad + (luma_offset_y_left >> chroma_subsamp_y);
  const int chroma_offset_x_left = left_pad + (luma_offset_x_left >> chroma_subsamp_x);
  const int chroma_offset_y_up = top_pad + (luma_offset_y_up >> chroma_subsamp_y);
  const int chroma_offset_x_up = left_pad + (luma_offset_x_up >> chroma_subsamp_x);
  const int chroma_offset_y_up_left = top_pad + (luma_offset_y_up_left >> chroma_subsamp_y);
  const int chroma_offset_x_up_left = left_pad + (luma_offset_x_up_left >> chroma_subsamp_x);

  luma_offset_y += top_pad;
  luma_offset_x += left_pad;
  luma_offset_y_left += top_pad;
  luma_offset_x_left += left_pad;
  luma_offset_y_up += top_pad;
  luma_offset_x_up += left_pad;
  luma_offset_y_up_left += top_pad;
  luma_offset_x_up_left += left_pad;

  const int grain_offset_u = data.grain_offset_u;
  const int grain_offset_v = data.grain_offset_v;
  const int luma_grain_stride = data.luma_grain_stride;
  const int chroma_grain_stride = data.chroma_grain_stride;

  // ---- Stage 1: fetch the grain of this block into group-shared memory.
  // Each thread copies every (luma_subblock_size_x / 4)-th sample of its row.
  {
    for (int j = GTid.x; j < luma_subblock_size_x; j += luma_subblock_size_x / 4) {
      luma_grain_temp[lid][row * luma_subblock_size_x + j] =
          grain_block[(luma_offset_y + row) * luma_grain_stride + luma_offset_x + j];

      if (row < true_chroma_subblock_size_y && j < true_chroma_subblock_size_x && enable_chroma) {
        const int c_idx = row * true_chroma_subblock_size_x + j;
        const int g_idx = (chroma_offset_y + row) * chroma_grain_stride + chroma_offset_x + j;
        cb_grain_temp[lid][c_idx] = grain_block[grain_offset_u + g_idx];
        cr_grain_temp[lid][c_idx] = grain_block[grain_offset_v + g_idx];
      }
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 2: horizontal overlap with the left neighbour.
  // Two luma columns (weights chosen by column) and one chroma column.
  if (overlap && x) {
    const int j = GTid.x;
    if (j < 2) {
      const int luma_left = grain_block[(luma_offset_y_left + row) * luma_grain_stride + luma_offset_x_left + j];
      const int l_idx = row * luma_subblock_size_x + j;
      luma_grain_temp[lid][l_idx] = clamp_ln(j + 1, luma_left, luma_grain_temp[lid][l_idx]);
    }
    if (row < true_chroma_subblock_size_y && j <= (1 - chroma_subsamp_x) && enable_chroma) {
      const int g_idx = (chroma_offset_y_left + row) * chroma_grain_stride + chroma_offset_x_left + j;
      const int cb_left = grain_block[grain_offset_u + g_idx];
      const int cr_left = grain_block[grain_offset_v + g_idx];
      // Same stride as every other access to the chroma scratch arrays.
      const int c_idx = row * true_chroma_subblock_size_x + j;
      cb_grain_temp[lid][c_idx] = clamp_ln0(cb_left, cb_grain_temp[lid][c_idx]);
      cr_grain_temp[lid][c_idx] = clamp_ln0(cr_left, cr_grain_temp[lid][c_idx]);
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 3: vertical overlap with the upper neighbour.
  // Two luma rows and one chroma row. In the corner, the upper neighbour's
  // sample is first blended with the up-left neighbour's sample.
  if (overlap && y) {
    for (int j = GTid.x; j < luma_subblock_size_x; j += luma_subblock_size_x / 4) {
      if (row < 2) {
        const int luma_up_left =
            grain_block[(luma_offset_y_up_left + row) * luma_grain_stride + luma_offset_x_up_left + j];
        int luma_up = grain_block[(luma_offset_y_up + row) * luma_grain_stride + luma_offset_x_up + j];
        if (x && (j < 2)) {
          luma_up = clamp_ln(j + 1, luma_up_left, luma_up);
        }
        const int l_idx = row * luma_subblock_size_x + j;
        luma_grain_temp[lid][l_idx] = clamp_ln(row + 1, luma_up, luma_grain_temp[lid][l_idx]);
      }

      if ((row <= (1 - chroma_subsamp_y)) && (j < true_chroma_subblock_size_x) && enable_chroma) {
        const int g_up_left = (chroma_offset_y_up_left + row) * chroma_grain_stride + chroma_offset_x_up_left + j;
        const int g_up = (chroma_offset_y_up + row) * chroma_grain_stride + chroma_offset_x_up + j;

        const int cb_up_left = grain_block[grain_offset_u + g_up_left];
        const int cr_up_left = grain_block[grain_offset_v + g_up_left];
        int cb_up = grain_block[grain_offset_u + g_up];
        int cr_up = grain_block[grain_offset_v + g_up];

        if (x && (j == 0)) {
          cb_up = clamp_ln0(cb_up_left, cb_up);
          cr_up = clamp_ln0(cr_up_left, cr_up);
        }

        const int c_idx = row * true_chroma_subblock_size_x + j;
        cb_grain_temp[lid][c_idx] = clamp_ln0(cb_up, cb_grain_temp[lid][c_idx]);
        cr_grain_temp[lid][c_idx] = clamp_ln0(cr_up, cr_grain_temp[lid][c_idx]);
      }
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // ---- Parameters of the grain application.
  const int rounding_offset = (1 << (data.params.scaling_shift - 1));
  int min_luma, max_luma, min_chroma, max_chroma;

  if (data.params.clip_to_restricted_range) {
    min_luma = min_luma_legal_range << (bit_depth - 8);
    max_luma = max_luma_legal_range << (bit_depth - 8);
    if (data.mc_identity) {
      min_chroma = min_luma_legal_range << (bit_depth - 8);
      max_chroma = max_luma_legal_range << (bit_depth - 8);
    } else {
      min_chroma = min_chroma_legal_range << (bit_depth - 8);
      max_chroma = max_chroma_legal_range << (bit_depth - 8);
    }
  } else {
    min_luma = min_chroma = 0;
    max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
  }

  int cb_mult = data.params.cb_mult - 128;            // fixed scale
  int cb_luma_mult = data.params.cb_luma_mult - 128;  // fixed scale
  int cb_offset = (data.params.cb_offset << (bit_depth - 8)) - (1 << bit_depth);

  int cr_mult = data.params.cr_mult - 128;            // fixed scale
  int cr_luma_mult = data.params.cr_luma_mult - 128;  // fixed scale
  int cr_offset = (data.params.cr_offset << (bit_depth - 8)) - (1 << bit_depth);

  if (data.params.chroma_scaling_from_luma) {
    // The scaling index is then the co-located luma value itself:
    // (luma * 64 + 0 * chroma) >> 6.
    cb_mult = 0;
    cb_luma_mult = 64;
    cb_offset = 0;

    cr_mult = 0;
    cr_luma_mult = 64;
    cr_offset = 0;
  }

  const int apply_y = data.params.num_y_points > 0 ? 1 : 0;
  const int apply_cb = (data.params.num_cb_points > 0 || data.params.chroma_scaling_from_luma) ? 1 : 0;
  const int apply_cr = (data.params.num_cr_points > 0 || data.params.chroma_scaling_from_luma) ? 1 : 0;

  // ---- Stage 4: luma. Each thread handles four adjacent samples of its row.
  const int col = GTid.x * 4;
  const int luma_idx = row * luma_subblock_size_x + col;

  const int2 src_luma_plane = data.src_planes[0].xy;
  const int2 dst_luma_plane = data.dst_planes[0].xy;
  const int4 in_luma = fg_load_samples(src_luma_plane.y + (y + row) * src_luma_plane.x, x + col, bit_depth);

  int4 luma_scale;
  if (bit_depth == 8) {
    luma_scale = int4(data.scaling_lut[in_luma.x].x, data.scaling_lut[in_luma.y].x,
                      data.scaling_lut[in_luma.z].x, data.scaling_lut[in_luma.w].x);
  } else {
    luma_scale = int4(scale_LUT(in_luma.x), scale_LUT(in_luma.y), scale_LUT(in_luma.z), scale_LUT(in_luma.w));
  }
  const int4 luma_grain = int4(luma_grain_temp[lid][luma_idx + 0], luma_grain_temp[lid][luma_idx + 1],
                               luma_grain_temp[lid][luma_idx + 2], luma_grain_temp[lid][luma_idx + 3]);
  const int4 scaled_luma = luma_scale * luma_grain;

  int4 out_luma = clamp(in_luma + ((scaled_luma + rounding_offset) >> data.params.scaling_shift), min_luma, max_luma);
  if (!apply_y) {
    out_luma = in_luma;
  }

  if (data.is_10x3) {
    // Stage the result; it is repacked across the three blocks below.
    luma_grain_temp[lid][luma_idx + 0] = out_luma.x;
    luma_grain_temp[lid][luma_idx + 1] = out_luma.y;
    luma_grain_temp[lid][luma_idx + 2] = out_luma.z;
    luma_grain_temp[lid][luma_idx + 3] = out_luma.w;
  } else if (((y + row) < data.height) && ((x + col) < data.width)) {
    fg_store_samples(dst_luma_plane.y + (y + row) * dst_luma_plane.x, x + col, bit_depth, out_luma);
  }

  GroupMemoryBarrierWithGroupSync();

  if (data.is_10x3) {
    // The three blocks of the group form one row of 3 * luma_subblock_size_x
    // samples, i.e. luma_subblock_size_x packed words; word j3 holds samples
    // 3 * j3 .. 3 * j3 + 2 of that row.
    const int x3 = Gid.x * luma_subblock_size_x * 3;
    for (int j3 = first_j3; j3 < luma_subblock_size_x; j3 += j3_step) {
      uint3 res;
      res.x = luma_grain_temp[(j3 * 3 + 0) / (uint)luma_subblock_size_x]
                             [row * luma_subblock_size_x + (j3 * 3 + 0) % (uint)luma_subblock_size_x];
      res.y = luma_grain_temp[(j3 * 3 + 1) / (uint)luma_subblock_size_x]
                             [row * luma_subblock_size_x + (j3 * 3 + 1) % (uint)luma_subblock_size_x];
      res.z = luma_grain_temp[(j3 * 3 + 2) / (uint)luma_subblock_size_x]
                             [row * luma_subblock_size_x + (j3 * 3 + 2) % (uint)luma_subblock_size_x];
      if (((y + row) < data.height) && ((x3 + j3 * 3) < data.width)) {
        dst.Store(dst_luma_plane.y + (y + row) * dst_luma_plane.x + 4 * x3 / 3U + j3 * 4, fg_pack_10x3(res));
      }
    }
  }

  // ---- Stage 5: luma value co-located with each chroma sample.
  // With horizontal subsampling this is the rounded mean of two horizontally
  // adjacent luma samples taken from the even luma rows.
  if (chroma_subsamp_x) {
    if ((row & 1) == 0) {
      const int a_idx = (row >> 1) * true_chroma_subblock_size_x + (col >> 1);
      avarage_luma[lid][a_idx + 0] = (in_luma.x + in_luma.y + 1) >> 1;
      avarage_luma[lid][a_idx + 1] = (in_luma.z + in_luma.w + 1) >> 1;
    }
  } else {
    const int a_idx = row * true_chroma_subblock_size_x + col;
    avarage_luma[lid][a_idx + 0] = in_luma.x;
    avarage_luma[lid][a_idx + 1] = in_luma.y;
    avarage_luma[lid][a_idx + 2] = in_luma.z;
    avarage_luma[lid][a_idx + 3] = in_luma.w;
  }

  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 6: chroma. Only the threads covering the chroma block take part.
  if (row < true_chroma_subblock_size_y && col < true_chroma_subblock_size_x && enable_chroma) {
    const int c_idx = row * true_chroma_subblock_size_x + col;
    const int c_y = (y >> chroma_subsamp_y) + row;
    const int c_x = (x >> chroma_subsamp_x) + col;
    const bool c_in_frame = (c_y < (data.height >> chroma_subsamp_y)) && (c_x < (data.width >> chroma_subsamp_x));
    const int max_scale_index = (256 << (bit_depth - 8)) - 1;

    const int4 average_luma_c = int4(avarage_luma[lid][c_idx + 0], avarage_luma[lid][c_idx + 1],
                                     avarage_luma[lid][c_idx + 2], avarage_luma[lid][c_idx + 3]);

    {  // Cb
      const int2 src_cb_plane = data.src_planes[1].xy;
      const int4 in_cb = fg_load_samples(src_cb_plane.y + c_y * src_cb_plane.x, c_x, bit_depth);

      // Index into the scaling function: mix of co-located luma and chroma.
      const int4 cb_to_scale =
          clamp(((average_luma_c * cb_luma_mult + cb_mult * in_cb) >> 6) + cb_offset, 0, max_scale_index);

      int4 cb_scale;
      if (bit_depth == 8) {
        cb_scale = int4(data.scaling_lut[cb_to_scale.x].y, data.scaling_lut[cb_to_scale.y].y,
                        data.scaling_lut[cb_to_scale.z].y, data.scaling_lut[cb_to_scale.w].y);
      } else {
        cb_scale = int4(scale_LUT_cb(cb_to_scale.x), scale_LUT_cb(cb_to_scale.y), scale_LUT_cb(cb_to_scale.z),
                        scale_LUT_cb(cb_to_scale.w));
      }
      const int4 cb_grain = int4(cb_grain_temp[lid][c_idx + 0], cb_grain_temp[lid][c_idx + 1],
                                 cb_grain_temp[lid][c_idx + 2], cb_grain_temp[lid][c_idx + 3]);
      const int4 scaled_cb = cb_scale * cb_grain;

      int4 out_cb =
          clamp(in_cb + ((scaled_cb + rounding_offset) >> data.params.scaling_shift), min_chroma, max_chroma);
      if (!apply_cb) {
        out_cb = in_cb;
      }

      if (data.is_10x3) {
        cb_grain_temp[lid][c_idx + 0] = out_cb.x;
        cb_grain_temp[lid][c_idx + 1] = out_cb.y;
        cb_grain_temp[lid][c_idx + 2] = out_cb.z;
        cb_grain_temp[lid][c_idx + 3] = out_cb.w;
      } else if (c_in_frame) {
        const int2 dst_cb_plane = data.dst_planes[1].xy;
        fg_store_samples(dst_cb_plane.y + c_y * dst_cb_plane.x, c_x, bit_depth, out_cb);
      }
    }

    {  // Cr
      const int2 src_cr_plane = data.src_planes[2].xy;
      const int4 in_cr = fg_load_samples(src_cr_plane.y + c_y * src_cr_plane.x, c_x, bit_depth);

      const int4 cr_to_scale =
          clamp(((average_luma_c * cr_luma_mult + cr_mult * in_cr) >> 6) + cr_offset, 0, max_scale_index);

      int4 cr_scale;
      if (bit_depth == 8) {
        cr_scale = int4(data.scaling_lut[cr_to_scale.x].z, data.scaling_lut[cr_to_scale.y].z,
                        data.scaling_lut[cr_to_scale.z].z, data.scaling_lut[cr_to_scale.w].z);
      } else {
        cr_scale = int4(scale_LUT_cr(cr_to_scale.x), scale_LUT_cr(cr_to_scale.y), scale_LUT_cr(cr_to_scale.z),
                        scale_LUT_cr(cr_to_scale.w));
      }
      const int4 cr_grain = int4(cr_grain_temp[lid][c_idx + 0], cr_grain_temp[lid][c_idx + 1],
                                 cr_grain_temp[lid][c_idx + 2], cr_grain_temp[lid][c_idx + 3]);
      const int4 scaled_cr = cr_scale * cr_grain;

      int4 out_cr =
          clamp(in_cr + ((scaled_cr + rounding_offset) >> data.params.scaling_shift), min_chroma, max_chroma);
      if (!apply_cr) {
        out_cr = in_cr;
      }

      if (data.is_10x3) {
        cr_grain_temp[lid][c_idx + 0] = out_cr.x;
        cr_grain_temp[lid][c_idx + 1] = out_cr.y;
        cr_grain_temp[lid][c_idx + 2] = out_cr.z;
        cr_grain_temp[lid][c_idx + 3] = out_cr.w;
      } else if (c_in_frame) {
        const int2 dst_cr_plane = data.dst_planes[2].xy;
        fg_store_samples(dst_cr_plane.y + c_y * dst_cr_plane.x, c_x, bit_depth, out_cr);
      }
    }
  }

  GroupMemoryBarrierWithGroupSync();

  // ---- Packed 10x3 chroma output.
  // Guarded by enable_chroma: without it the chroma scratch arrays were never
  // written and must not reach the destination planes.
  // Thread rows below true_chroma_subblock_size_y write Cb, the rest write Cr.
  // Start and step follow the thread-group width so that every packed word of
  // a row is written by exactly one thread.
  if (data.is_10x3 && enable_chroma) {
    const int x3 = Gid.x * 3 * true_chroma_subblock_size_x;
    const int c_height = data.height >> chroma_subsamp_y;
    const int c_width = data.width >> chroma_subsamp_x;

    if (GTid.y < true_chroma_subblock_size_y) {
      const int c_row = GTid.y;
      const int c_y = (y >> chroma_subsamp_y) + c_row;
      const int2 dst_cb_plane = data.dst_planes[1].xy;
      for (int j3 = first_j3; j3 < true_chroma_subblock_size_x; j3 += j3_step) {
        uint3 res;
        res.x = cb_grain_temp[(j3 * 3 + 0) / (uint)true_chroma_subblock_size_x]
                             [c_row * true_chroma_subblock_size_x + (j3 * 3 + 0) % (uint)true_chroma_subblock_size_x];
        res.y = cb_grain_temp[(j3 * 3 + 1) / (uint)true_chroma_subblock_size_x]
                             [c_row * true_chroma_subblock_size_x + (j3 * 3 + 1) % (uint)true_chroma_subblock_size_x];
        res.z = cb_grain_temp[(j3 * 3 + 2) / (uint)true_chroma_subblock_size_x]
                             [c_row * true_chroma_subblock_size_x + (j3 * 3 + 2) % (uint)true_chroma_subblock_size_x];
        if ((c_y < c_height) && ((x3 + j3 * 3) < c_width)) {
          dst.Store(dst_cb_plane.y + c_y * dst_cb_plane.x + 4 * x3 / 3U + j3 * 4, fg_pack_10x3(res));
        }
      }
    } else {
      const int c_row = GTid.y - true_chroma_subblock_size_y;
      const int c_y = (y >> chroma_subsamp_y) + c_row;
      const int2 dst_cr_plane = data.dst_planes[2].xy;
      for (int j3 = first_j3; j3 < true_chroma_subblock_size_x; j3 += j3_step) {
        uint3 res;
        res.x = cr_grain_temp[(j3 * 3 + 0) / (uint)true_chroma_subblock_size_x]
                             [c_row * true_chroma_subblock_size_x + (j3 * 3 + 0) % (uint)true_chroma_subblock_size_x];
        res.y = cr_grain_temp[(j3 * 3 + 1) / (uint)true_chroma_subblock_size_x]
                             [c_row * true_chroma_subblock_size_x + (j3 * 3 + 1) % (uint)true_chroma_subblock_size_x];
        res.z = cr_grain_temp[(j3 * 3 + 2) / (uint)true_chroma_subblock_size_x]
                             [c_row * true_chroma_subblock_size_x + (j3 * 3 + 2) % (uint)true_chroma_subblock_size_x];
        if ((c_y < c_height) && ((x3 + j3 * 3) < c_width)) {
          dst.Store(dst_cr_plane.y + c_y * dst_cr_plane.x + 4 * x3 / 3U + j3 * 4, fg_pack_10x3(res));
        }
      }
    }
  }
}