// libav1/dx/shaders/idct_lossless.hlsl - av1-xbox-one - Git at Google

 /*
  * Copyright 2020 Google LLC
  *
  */

 /*
  * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include "idct_shader_common.h"

 // Number of int4 entries of cb_scans that form one scan table; main() indexes
 // the table with ScanSize * <scan id> + <thread slot 0..3>.
 #define ScanSize 4
 // Right shift applied to each coefficient when it is read in the first (row)
 // pass of main().
 #define QuantShift 2

 // Scan tables (room for 3 tables of ScanSize entries). Each int4 component is
 // used by main() as a destination index inside a block's 16-entry region of
 // shared_4x4_mem.
 cbuffer cb_scans_4x4 : register(b0) { int4 cb_scans[ScanSize * 3]; };

 // Group-shared scratch: 4 ints per thread of the 64-thread group, i.e. one
 // 16-entry (4x4) region for every 4 consecutive threads.
 groupshared int shared_4x4_mem[4 * 64];

 // One 4-point butterfly of the lossless transform (this matches the shape of
 // the AV1 4x4 inverse Walsh-Hadamard step). Inputs are given in memory order
 // (element 0, 1, 2, 3); the four results are returned in memory order too.
 int4 idct_lossless_butterfly4(int in0, int in1, int in2, int in3)
 {
   const int sum = in0 + in1;
   const int diff = in2 - in3;
   const int half = (sum - diff) >> 1;
   const int out1 = half - in3;
   const int out2 = half - in1;
   return int4(sum - out1, out1, out2, diff + out2);
 }

 // Lossless 4x4 inverse transform. Four consecutive threads cooperate on one
 // block: each thread de-scans up to four coefficients into group-shared
 // memory, transforms one row, then one column, and finally writes one row of
 // four 16-bit results to buf_dst.
 //
 // thread.x: global work-item index; threads at or beyond cb_wicount exit
 //           immediately.
 [numthreads(64, 1, 1)] void main(uint3 thread
                                  : SV_DispatchThreadID) {
   if (thread.x >= cb_wicount) return;

   // Slot (0..3) of this thread inside its 4-thread block.
   const int lane = thread.x & 3;

   // Block descriptor: x packs coefficient count / scan id / plane, y is the
   // coefficient offset (in dwords), z packs the destination position.
   const uint4 desc = buf_blocks.Load4((cb_index_offset + thread.x / 4) * IdctBlockSize);
   const int num_coefs = desc.x & 0x7ff;
   const int plane_idx = (desc.x >> 21) & 3;
   const int scan_entry = ScanSize * ((desc.x >> ScanShift) & ScanMask) + lane;

   // Start of this block's 16-entry region in shared memory, and of this
   // thread's row inside it.
   const int tile_base = 4 * (thread.x & (64 - 4));
   const int row_base = tile_base + 4 * lane;
   const int col_base = tile_base + lane;

   // Fetch this thread's four coefficients and zero them in the input buffer;
   // threads whose slot is at or past the coefficient count contribute zeros.
   const int src_addr = (desc.y + lane * 4) << 2;
   int4 raw = int4(0, 0, 0, 0);
   if (lane < num_coefs)
   {
     raw = (int4)buf_input.Load4(src_addr);
     buf_input.Store4(src_addr, int4(0, 0, 0, 0));
   }

   // Clamp to the signed (cb_bitdepth + 8)-bit range and scatter through the
   // scan table.
   const int range = 1 << (cb_bitdepth + 7);
   const int lo = -range;
   const int hi = range - 1;
   const int4 scan = cb_scans[scan_entry];
   shared_4x4_mem[tile_base + scan.x] = clamp(raw.x, lo, hi);
   shared_4x4_mem[tile_base + scan.y] = clamp(raw.y, lo, hi);
   shared_4x4_mem[tile_base + scan.z] = clamp(raw.z, lo, hi);
   shared_4x4_mem[tile_base + scan.w] = clamp(raw.w, lo, hi);

   GroupMemoryBarrier();

   // Row pass: coefficients are scaled down by QuantShift on the way in.
   const int4 row = idct_lossless_butterfly4(shared_4x4_mem[row_base + 0] >> QuantShift,
                                             shared_4x4_mem[row_base + 1] >> QuantShift,
                                             shared_4x4_mem[row_base + 2] >> QuantShift,
                                             shared_4x4_mem[row_base + 3] >> QuantShift);
   shared_4x4_mem[row_base + 0] = row.x;
   shared_4x4_mem[row_base + 1] = row.y;
   shared_4x4_mem[row_base + 2] = row.z;
   shared_4x4_mem[row_base + 3] = row.w;

   GroupMemoryBarrier();

   // Column pass over the values produced by the four row passes.
   const int4 col = idct_lossless_butterfly4(shared_4x4_mem[col_base + 0],
                                             shared_4x4_mem[col_base + 4],
                                             shared_4x4_mem[col_base + 8],
                                             shared_4x4_mem[col_base + 12]);
   shared_4x4_mem[col_base + 0] = col.x;
   shared_4x4_mem[col_base + 4] = col.y;
   shared_4x4_mem[col_base + 8] = col.z;
   shared_4x4_mem[col_base + 12] = col.w;

   GroupMemoryBarrier();

   // Destination: low 16 bits of desc.z step in 8-byte units (four 16-bit
   // values), high 16 bits step in units of four rows; this thread writes row
   // `lane` of the block.
   const int pitch = cb_planes[plane_idx].x;
   const int dst_addr = cb_planes[plane_idx].y + 8 * (desc.z & 0xffff) + (4 * (desc.z >> 16) + lane) * pitch;

   // Pack the four results of this thread's row as 16-bit values, two per dword.
   const int packed_lo = (shared_4x4_mem[row_base + 0] & 0xffff) | (shared_4x4_mem[row_base + 1] << 16);
   const int packed_hi = (shared_4x4_mem[row_base + 2] & 0xffff) | (shared_4x4_mem[row_base + 3] << 16);
   buf_dst.Store2(dst_addr, int2(packed_lo, packed_hi));
 }
	/*
	* Copyright 2020 Google LLC
	*
	*/

	/*
	* Copyright (c) 2020, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include "idct_shader_common.h"

	// Number of int4 entries of cb_scans that form one scan table; main() indexes
	// the table with ScanSize * <scan id> + <thread slot 0..3>.
	#define ScanSize 4
	// Right shift applied to each coefficient when it is read in the first (row)
	// pass of main().
	#define QuantShift 2

	// Scan tables (room for 3 tables of ScanSize entries). Each int4 component is
	// used by main() as a destination index inside a block's 16-entry region of
	// shared_4x4_mem.
	cbuffer cb_scans_4x4 : register(b0) { int4 cb_scans[ScanSize * 3]; };

	// Group-shared scratch: 4 ints per thread of the 64-thread group, i.e. one
	// 16-entry (4x4) region for every 4 consecutive threads.
	groupshared int shared_4x4_mem[4 * 64];

	// Lossless 4x4 inverse transform. Four consecutive threads cooperate on one
	// block: each thread de-scans up to four coefficients into group-shared
	// memory, transforms one row, then one column, and finally writes one row of
	// four 16-bit results to buf_dst.
	//
	// thread.x: global work-item index; threads at or beyond cb_wicount exit
	//           immediately.
	//
	// NOTE(review): GroupMemoryBarrier() orders group-shared memory accesses but
	// does not synchronize thread execution. This presumably relies on the whole
	// 64-thread group executing in lockstep on the target hardware - confirm
	// before reusing this shader elsewhere.
	[numthreads(64, 1, 1)] void main(uint3 thread
		: SV_DispatchThreadID) {
		if (thread.x >= cb_wicount) return;
		// Slot (0..3) of this thread inside its 4-thread block.
		int wi = thread.x & 3;
		// Block descriptor: x packs coefficient count / scan id / plane, y is the
		// coefficient offset (in dwords), z packs the destination position.
		uint4 block = buf_blocks.Load4((cb_index_offset + thread.x / 4) * IdctBlockSize);
		const int plane = (block.x >> 21) & 3;
		// Start of this block's 16-entry region in shared memory.
		const int loc_mem_offset = 4 * (thread.x & (64 - 4));
		const int scan_offset = ScanSize * ((block.x >> ScanShift) & ScanMask) + wi;
		const int coef_count = block.x & 0x7ff;

		// Fetch this thread's four coefficients and zero them in the input buffer;
		// threads whose slot is at or past the coefficient count contribute zeros.
		const int input_offset = (block.y + wi * 4) << 2;
		int4 coefs = int4(0, 0, 0, 0);
		if (wi < coef_count)
		{
			coefs = (int4)buf_input.Load4(input_offset);
			buf_input.Store4(input_offset, int4(0, 0, 0, 0));
		}
		// Clamp to the signed (cb_bitdepth + 8)-bit range and scatter through the
		// scan table.
		const int coef_min = -(1 << (cb_bitdepth + 7));
		const int coef_max = (1 << (cb_bitdepth + 7)) - 1;
		shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = clamp(coefs.x, coef_min, coef_max);
		shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = clamp(coefs.y, coef_min, coef_max);
		shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = clamp(coefs.z, coef_min, coef_max);
		shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = clamp(coefs.w, coef_min, coef_max);

		GroupMemoryBarrier();

		// Row handled by this thread, and column handled by this thread.
		const int loc_offset_row = (thread.x & 63) * 4;
		const int loc_offset_col = loc_mem_offset + wi;

		// Row pass: coefficients are scaled down by QuantShift on the way in.
		int a1, b1, c1, d1, e1;
		a1 = shared_4x4_mem[loc_offset_row + 0] >> QuantShift;
		c1 = shared_4x4_mem[loc_offset_row + 1] >> QuantShift;
		d1 = shared_4x4_mem[loc_offset_row + 2] >> QuantShift;
		b1 = shared_4x4_mem[loc_offset_row + 3] >> QuantShift;

		a1 += c1;
		d1 -= b1;
		e1 = (a1 - d1) >> 1;
		b1 = e1 - b1;
		c1 = e1 - c1;
		a1 -= b1;
		d1 += c1;

		shared_4x4_mem[loc_offset_row + 0] = a1;
		shared_4x4_mem[loc_offset_row + 1] = b1;
		shared_4x4_mem[loc_offset_row + 2] = c1;
		shared_4x4_mem[loc_offset_row + 3] = d1;

		GroupMemoryBarrier();

		// Column pass over the values produced by the four row passes.
		a1 = shared_4x4_mem[loc_offset_col + 0];
		c1 = shared_4x4_mem[loc_offset_col + 4];
		d1 = shared_4x4_mem[loc_offset_col + 8];
		b1 = shared_4x4_mem[loc_offset_col + 12];
		a1 += c1;
		d1 -= b1;
		e1 = (a1 - d1) >> 1;
		b1 = e1 - b1;
		c1 = e1 - c1;
		a1 -= b1;
		d1 += c1;
		shared_4x4_mem[loc_offset_col + 0] = a1;
		shared_4x4_mem[loc_offset_col + 4] = b1;
		shared_4x4_mem[loc_offset_col + 8] = c1;
		shared_4x4_mem[loc_offset_col + 12] = d1;

		GroupMemoryBarrier();

		// Destination: low 16 bits of block.z step in 8-byte units (four 16-bit
		// values), high 16 bits step in units of four rows. Since wi is 0..3,
		// (wi >> 2) is always 0 and (wi & 3) equals wi, so this thread writes row
		// wi of the block.
		const int stride = cb_planes[plane].x;
		const int offset =
			cb_planes[plane].y + 8 * ((block.z & 0xffff) + (wi >> 2)) + (4 * (block.z >> 16) + (wi & 3)) * stride;

		// Pack the four results of this thread's row as 16-bit values, two per dword.
		const int output_offset = loc_mem_offset + (wi & 3) * 4 + 4 * (wi >> 2);
		buf_dst.Store2(offset,
			int2((shared_4x4_mem[output_offset + 0] & 0xffff) | (shared_4x4_mem[output_offset + 1] << 16),
				(shared_4x4_mem[output_offset + 2] & 0xffff) | (shared_4x4_mem[output_offset + 3] << 16)));
	}