| /* |
| * Copyright 2020 Google LLC |
| * |
| */ |
| |
| /* |
| * Copyright (c) 2020, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include "film_grain_const.h" |
| |
| StructuredBuffer<int> random : register(t0); /* read-only input; not referenced by the code visible in this section */ |
| StructuredBuffer<int> random_uv : register(t1); /* read-only table indexed by the current LFSR state; main() uses the result as the state that starts the next chroma row */ |
| StructuredBuffer<int> gaussian_sequence : register(t2); /* read-only table indexed by clamp_rand(gauss_bits, state) to turn an LFSR state into a grain sample */ |
| RWStructuredBuffer<int> dst : register(u0); /* read/write grain storage: main() reads the luma grain window from it and writes the chroma block at dst_offset_u / dst_offset_v */ |
| |
| cbuffer cb_film_grain_data : register(b0) { FilmGrainGenData data; }; /* block sizes, strides, paddings, destination offsets and the film grain parameters (data.params) read by main() */ |
| |
// Advances the 16-bit linear feedback shift register by one step.
//
// The feedback bit is the XOR of bits 0, 1, 3 and 12 of the current state.
// The state is shifted right by one and the feedback bit is inserted at
// bit 15. Returns the new register state.
uint get_random_number(uint random_register_new) {
  const uint state = random_register_new;
  const uint taps = state ^ (state >> 1) ^ (state >> 3) ^ (state >> 12);
  const uint feedback = taps & 1;
  return (feedback << 15) | (state >> 1);
}
// Extracts a `bits`-wide value from a 16-bit LFSR state: the state is shifted
// right by (16 - bits) and masked to the low `bits` bits.
int clamp_rand(int bits, uint rand) {
  const int mask = (1 << bits) - 1;
  const uint shifted = rand >> (16 - bits);
  return shifted & mask;
}
| |
// Builds the initial LFSR state for a given line.
//
// luma_line: line number; only luma_line >> 5 (its index in units of 32
//            lines) influences the result.
// seed:      seed value; only its low 16 bits are used.
//
// Returns the low 16 bits of the seed with the high byte XORed by
// (n * 37 + 178) & 255 and the low byte XORed by (n * 173 + 105) & 255,
// where n = luma_line >> 5.
uint init_random_generator(int luma_line, uint seed) {
  // Part that is the same for the whole picture: low 16 bits of the seed.
  const uint seed_hi = (seed >> 8) & 255;
  const uint seed_lo = seed & 255;

  // Part that changes per group of 32 lines.
  const int line_group = luma_line >> 5;
  const uint mix_hi = (line_group * 37 + 178) & 255;
  const uint mix_lo = (line_group * 173 + 105) & 255;

  return ((seed_hi << 8) + seed_lo) ^ (mix_hi << 8) ^ mix_lo;
}
| |
// Number of threads in a group (the group is WGH x 1 x 1).
#define WGH 64
// Chroma grain block under construction. It first holds raw LFSR states, then
// Gaussian samples, then the auto-regressive filtered grain.
// NOTE(review): sized for a 38 x 44 block; chroma_block_size_y *
// chroma_grain_stride is assumed to fit - confirm against the host code.
groupshared int crcb_grain_temp[38 * 44];
// Local copy of a 70 x 76 window of luma grain read back from dst.
groupshared int luma_grain_temp[70 * 76];

// Generates one chroma grain block per thread group.
//
// DTid.y selects the plane: 0 uses num_cb_points / dst_offset_u, any other
// value uses num_cr_points / dst_offset_v. GTid.x is the thread index inside
// the group and is used to distribute rows / elements over the WGH threads.
//
// Stages:
//   1. Fill the block with LFSR states (one seed per row, then each row is
//      expanded independently) and map them through gaussian_sequence; or
//      zero the block when the plane has no grain.
//   2. Copy the luma grain window from dst into group shared memory.
//   3. Apply the auto-regressive filter in place, optionally adding the
//      averaged co-located 2x2 luma grain.
//   4. Write the finished block to dst.
//
// gauss_bits is not defined in this file section; it is presumably provided
// by film_grain_const.h.
[numthreads(WGH, 1, 1)] void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID) {
  const int chroma_block_size_y = data.chroma_block_size_y;
  const int chroma_block_size_x = data.chroma_block_size_x;
  const int chroma_grain_stride = data.chroma_grain_stride;
  const int luma_grain_stride = data.luma_grain_stride;
  const int chroma_block_len = chroma_block_size_y * chroma_block_size_x;
  const int lane = GTid.x;

  const int bit_depth = data.params.bit_depth;
  const int grain_center = 128 << (bit_depth - 8);
  const int grain_min = 0 - grain_center;
  const int grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
  const int gauss_sec_shift = 12 - bit_depth + data.params.grain_scale_shift;

  // Number of chroma neighbours used by the auto-regressive filter. When luma
  // grain is present, one extra coefficient follows them at index
  // num_chroma_taps and weights the averaged luma grain.
  const int num_chroma_taps = 2 * data.params.ar_coeff_lag * (data.params.ar_coeff_lag + 1);
  const int rounding_offset = (1 << (data.params.ar_coeff_shift - 1));

  // A plane has no grain when it has no scaling points of its own and is not
  // scaled from luma; its block is then written out as all zeros.
  int plane_points = data.params.num_cr_points;
  if (DTid.y == 0) plane_points = data.params.num_cb_points;
  const bool skip = !(plane_points || data.params.chroma_scaling_from_luma);

  // ---- Stage 1a: thread 0 produces the LFSR state that starts every row.
  if (!skip && GTid.x == 0) {
    uint seed_reg = init_random_generator((7 + DTid.y * 4) << 5, data.params.random_seed);
    seed_reg = get_random_number(seed_reg);
    crcb_grain_temp[0] = seed_reg;
    // Could be optimized by parallelizing in power-of-two steps.
    for (int seed_row = 1; seed_row < chroma_block_size_y; seed_row++) {
      seed_reg = random_uv[seed_reg];
      crcb_grain_temp[seed_row * chroma_grain_stride] = seed_reg;
    }
  }
  // Every thread reads seeds written by thread 0, so the whole group has to
  // wait here. GroupMemoryBarrier() alone does not make threads wait for each
  // other. The sync is kept outside the branch so it is in uniform flow.
  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 1b: each thread expands whole rows from their seed.
  if (!skip) {
    for (int gen_row = lane; gen_row < chroma_block_size_y; gen_row += WGH) {
      uint row_reg = crcb_grain_temp[gen_row * chroma_grain_stride];
      for (int gen_col = 1; gen_col < chroma_block_size_x; gen_col++) {
        row_reg = get_random_number(row_reg);
        crcb_grain_temp[gen_row * chroma_grain_stride + gen_col] = row_reg;
      }
    }
  }
  // Stage 1c walks the block linearly and therefore touches elements that
  // were produced by other threads in stage 1b.
  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 1c: map LFSR states to rounded, scaled Gaussian samples.
  if (!skip) {
    for (int gauss_idx = lane; gauss_idx < chroma_block_len; gauss_idx += WGH) {
      crcb_grain_temp[gauss_idx] =
          (gaussian_sequence[clamp_rand(gauss_bits, crcb_grain_temp[gauss_idx])] + ((1 << gauss_sec_shift) >> 1)) >>
          gauss_sec_shift;
    }
  } else {
    for (int zero_idx = lane; zero_idx < chroma_block_len; zero_idx += WGH) crcb_grain_temp[zero_idx] = 0;
  }

  // ---- Stage 2: copy the luma grain window (skipping the padding) from dst.
  for (int luma_row = 0; luma_row < 70; luma_row++) {
    for (int luma_col = lane; luma_col < 76; luma_col += WGH) {
      luma_grain_temp[luma_row * 76 + luma_col] =
          dst[(data.top_pad + luma_row) * luma_grain_stride + luma_col + data.left_pad];
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 3: in-place auto-regressive filter.
  //
  // A sample depends on already-filtered samples in the rows above (up to
  // ar_coeff_lag columns to the right) and to its left in the same row. Rows
  // are therefore processed as a skewed wavefront: thread t handles one row
  // and trails the thread of the row above by (left_pad + 1) columns.
  // NOTE(review): this requires left_pad >= ar_coeff_lag - confirm.
  //
  // The group is synchronized after every step so that the skew is enforced
  // explicitly instead of relying on all WGH threads running in lockstep,
  // which is not guaranteed when the group spans more than one hardware wave.
  const int filter_row_begin = data.top_pad;
  const int filter_row_end = chroma_block_size_y - data.bottom_pad;
  const int filter_col_begin = data.left_pad;
  const int filter_col_end = chroma_block_size_x - data.right_pad;
  const int column_lag = data.left_pad + 1;

  for (int pass_row = filter_row_begin; pass_row < filter_row_end; pass_row += WGH) {
    const int filt_row = pass_row + lane;
    const bool row_active = !skip && (filt_row < filter_row_end);
    // Steps until the last active row of this pass has reached the last column.
    const int rows_in_pass = min(filter_row_end - pass_row, WGH);
    const int steps_in_pass = (filter_col_end - filter_col_begin) + (rows_in_pass - 1) * column_lag;

    for (int step = 0; step < steps_in_pass; step++) {
      const int filt_col = filter_col_begin + step - lane * column_lag;
      if (row_active && filt_col >= filter_col_begin && filt_col < filter_col_end) {
        int wsum_crcb = 0;
        if (data.params.ar_coeff_lag == 3) {
          // Lag 3: the 24 neighbours are the 7-wide rows -3..-1 plus the three
          // samples to the left, so offsets can be derived from the tap index.
          for (uint tap = 0; tap < 24; tap++) {
            wsum_crcb = wsum_crcb +
                        data.params.ar_coeffs[1 + DTid.y][tap].x *
                            crcb_grain_temp[(filt_row + tap / 7 - 3) * chroma_grain_stride + filt_col + tap % 7 - 3];
          }
        } else {
          // All chroma taps are applied whether or not luma grain is present.
          // (The bound used to be num_pos_chroma - 1, which dropped the last
          // tap when num_y_points == 0.)
          for (int pos = 0; pos < num_chroma_taps; pos++) {
            wsum_crcb = wsum_crcb +
                        data.params.ar_coeffs[1 + DTid.y][pos].x *
                            crcb_grain_temp[(filt_row + data.pred_pos[pos][0].y) * chroma_grain_stride + filt_col +
                                            data.pred_pos[pos][1].y];
          }
        }
        if (data.params.num_y_points > 0) {
          // Rounded average of the 2x2 luma grain samples co-located with this
          // chroma sample (chroma is addressed at half the luma resolution).
          const int luma_y = (filt_row - data.top_pad) << 1;
          const int luma_x = (filt_col - data.left_pad) << 1;
          int av_luma = luma_grain_temp[luma_y * 76 + luma_x] + luma_grain_temp[luma_y * 76 + luma_x + 1] +
                        luma_grain_temp[(luma_y + 1) * 76 + luma_x] + luma_grain_temp[(luma_y + 1) * 76 + luma_x + 1];
          av_luma = (av_luma + 2) >> 2;

          wsum_crcb = wsum_crcb + data.params.ar_coeffs[DTid.y + 1][num_chroma_taps].x * av_luma;
        }
        crcb_grain_temp[filt_row * chroma_grain_stride + filt_col] =
            clamp(crcb_grain_temp[filt_row * chroma_grain_stride + filt_col] +
                      ((wsum_crcb + rounding_offset) >> data.params.ar_coeff_shift),
                  grain_min, grain_max);
      }
      // Publish this step's samples before any thread starts the next step.
      GroupMemoryBarrierWithGroupSync();
    }
  }
  GroupMemoryBarrierWithGroupSync();

  // ---- Stage 4: write the finished block to the plane's area in dst.
  int dst_offset = data.dst_offset_v;
  if (DTid.y == 0) dst_offset = data.dst_offset_u;
  for (int out_idx = lane; out_idx < chroma_block_len; out_idx += WGH) {
    dst[dst_offset + out_idx] = crcb_grain_temp[out_idx];
  }
}