// blob: 59ed5f97ac30dc60c40baf57c8a57ed815446ed5 [file] [log] [blame]
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "idct_shader_common.h"
// Number of int4 entries per scan table: main() indexes cb_scans with
// ScanSize * <scan selector> + <thread index within its group of four>.
#define ScanSize 4
// Right shift applied to every value read by the first transform pass in main().
#define QuantShift 2
// Scan tables bound at register b0: three tables of ScanSize int4 entries.
// Each int4 component is used by main() as an index (0..15 expected —
// TODO confirm) into a block's 16-int region of shared_4x4_mem.
cbuffer cb_scans_4x4 : register(b0) { int4 cb_scans[ScanSize * 3]; };
// Group-shared scratch: 4 ints per thread for a 64-thread group, i.e. one
// 16-int (4x4) region for every four consecutive threads.
groupshared int shared_4x4_mem[4 * 64];
// Compute entry point: 4x4 inverse transform, four threads per block.
//
// Every four consecutive threads share one block descriptor read from
// buf_blocks. Each thread loads up to four coefficients, scatters them into the
// block's 16-int region of shared_4x4_mem through the scan table, runs one
// 4-point butterfly over a contiguous group of four ints and then one over a
// stride-4 group, and finally packs one group of four results as 16-bit values
// and stores it to buf_dst.
//
// The butterfly appears to match the 4-point inverse Walsh-Hadamard transform
// used for AV1 lossless blocks — TODO confirm against the reference decoder.
//
// buf_blocks, buf_input, buf_dst, cb_wicount, cb_index_offset, cb_bitdepth,
// cb_planes, IdctBlockSize, ScanShift and ScanMask are not declared in this
// file; presumably they come from idct_shader_common.h.
//
// thread: SV_DispatchThreadID; only thread.x is used.
[numthreads(64, 1, 1)] void main(uint3 thread
: SV_DispatchThreadID) {
// Threads at or beyond the work-item count do nothing.
if (thread.x >= cb_wicount) return;
// Index (0..3) of this thread within its group of four.
int wi = thread.x & 3;
// Block descriptor shared by the four threads (thread.x / 4 selects the block).
uint4 block = buf_blocks.Load4((cb_index_offset + thread.x / 4) * IdctBlockSize);
// Bits 21..22 of block.x select the entry of cb_planes used for output.
const int plane = (block.x >> 21) & 3;
// Base of this block's 16-int region: thread.x & 60 is the thread's index in
// the 64-thread group rounded down to a multiple of four, times 4 ints each.
const int loc_mem_offset = 4 * (thread.x & (64 - 4));
// Scan-table entry for this thread: table chosen by a bit field of block.x,
// entry chosen by wi.
const int scan_offset = ScanSize * ((block.x >> ScanShift) & ScanMask) + wi;
// Low 11 bits of block.x; compared against wi below to decide whether this
// thread has anything to load.
const int coef_count = block.x & 0x7ff;
// Address of this thread's four coefficients: (block.y + 4 * wi) scaled by 4
// (presumably a byte offset of 32-bit values — TODO confirm buf_input's type).
const int input_offset = (block.y + wi * 4) << 2;
int4 coefs = int4(0, 0, 0, 0);
if (wi < coef_count)
{
coefs = (int4)buf_input.Load4(input_offset);
// Zero the consumed coefficients in the input buffer.
buf_input.Store4(input_offset, int4(0, 0, 0, 0));
}
// Clamp range: [-2^(cb_bitdepth + 7), 2^(cb_bitdepth + 7) - 1].
const int coef_min = -(1 << (cb_bitdepth + 7));
const int coef_max = (1 << (cb_bitdepth + 7)) - 1;
// Scatter the clamped coefficients to the positions given by the scan entry.
// Threads that skipped the load write zeros.
shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = clamp(coefs.x, coef_min, coef_max);
shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = clamp(coefs.y, coef_min, coef_max);
shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = clamp(coefs.z, coef_min, coef_max);
shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = clamp(coefs.w, coef_min, coef_max);
// NOTE(review): the reads below consume values written by the other three
// threads of the block. GroupMemoryBarrier() is documented as a memory barrier
// only (the execution-synchronising variant is GroupMemoryBarrierWithGroupSync),
// so this appears to rely on the four threads executing in lockstep — confirm.
GroupMemoryBarrier();
// First pass operates on four consecutive ints; this equals
// loc_mem_offset + 4 * wi.
const int loc_offset_row = (thread.x & 63) * 4;
// Second pass operates on four ints spaced 4 apart, starting at element wi.
const int loc_offset_col = loc_mem_offset + wi;
int a1, b1, c1, d1, e1;
// First pass: inputs are shifted right by QuantShift before the butterfly.
// Note the load order a, c, d, b versus the store order a, b, c, d.
a1 = shared_4x4_mem[loc_offset_row + 0] >> QuantShift;
c1 = shared_4x4_mem[loc_offset_row + 1] >> QuantShift;
d1 = shared_4x4_mem[loc_offset_row + 2] >> QuantShift;
b1 = shared_4x4_mem[loc_offset_row + 3] >> QuantShift;
a1 += c1;
d1 -= b1;
e1 = (a1 - d1) >> 1;
b1 = e1 - b1;
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
// Write back in place; each thread only touches its own four consecutive ints.
shared_4x4_mem[loc_offset_row + 0] = a1;
shared_4x4_mem[loc_offset_row + 1] = b1;
shared_4x4_mem[loc_offset_row + 2] = c1;
shared_4x4_mem[loc_offset_row + 3] = d1;
// The second pass reads first-pass results written by the other three threads.
GroupMemoryBarrier();
// Second pass: same butterfly over the stride-4 elements, without the shift.
a1 = shared_4x4_mem[loc_offset_col + 0];
c1 = shared_4x4_mem[loc_offset_col + 4];
d1 = shared_4x4_mem[loc_offset_col + 8];
b1 = shared_4x4_mem[loc_offset_col + 12];
a1 += c1;
d1 -= b1;
e1 = (a1 - d1) >> 1;
b1 = e1 - b1;
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
shared_4x4_mem[loc_offset_col + 0] = a1;
shared_4x4_mem[loc_offset_col + 4] = b1;
shared_4x4_mem[loc_offset_col + 8] = c1;
shared_4x4_mem[loc_offset_col + 12] = d1;
// The output stage reads four consecutive ints written by all four threads.
GroupMemoryBarrier();
// cb_planes[plane].x is used as the line stride and .y as the base offset of
// the destination (presumably in bytes — TODO confirm).
const int stride = cb_planes[plane].x;
// Low 16 bits of block.z scale by 8 (four 16-bit values), high 16 bits scale by
// 4 lines. Since wi is 0..3, (wi >> 2) is always 0 and (wi & 3) equals wi, so
// each thread addresses line wi of the block.
const int offset =
cb_planes[plane].y + 8 * ((block.z & 0xffff) + (wi >> 2)) + (4 * (block.z >> 16) + (wi & 3)) * stride;
// Start of the four consecutive results this thread outputs
// (loc_mem_offset + 4 * wi, for the same reason as above).
const int output_offset = loc_mem_offset + (wi & 3) * 4 + 4 * (wi >> 2);
// Pack four results into two 32-bit words, each holding two 16-bit values
// (even element in the low half, odd element in the high half).
buf_dst.Store2(offset,
int2((shared_4x4_mem[output_offset + 0] & 0xffff) | (shared_4x4_mem[output_offset + 1] << 16),
(shared_4x4_mem[output_offset + 2] & 0xffff) | (shared_4x4_mem[output_offset + 3] << 16)));
}