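Move coefficient clamping and zero-fill from CPU to GPU IDCT shaders

The function av1_read_coeffs_txb no longer clamps the dequantized
coefficients or zero-fills unused entries on the CPU. The IDCT shaders
now clamp each coefficient after loading it and store zeros back into
the coefficient buffer, which is bound as a UAV (new UploadUAV memory
type) so the shaders can write to it.

Change-Id: I2dbc785ca94b71479aa438953263022720ca68f4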
diff --git a/libav1/av1/decoder/decodetxb.c b/libav1/av1/decoder/decodetxb.c
index c4451da..955cd06 100644
--- a/libav1/av1/decoder/decodetxb.c
+++ b/libav1/av1/decoder/decodetxb.c
@@ -316,10 +316,10 @@
if (sign) {
dq_coeff = -dq_coeff;
}
- tcoeffs_raw[c] = clamp(dq_coeff, min_value, max_value);
+ tcoeffs_raw[c] = dq_coeff;// stored unclamped: the range clamp moved to the IDCT shaders (was clamp(dq_coeff, min_value, max_value))
}
- else
- tcoeffs_raw[c] = 0;
+ // The else branch that stored tcoeffs_raw[c] = 0 was dropped on purpose.
+ // NOTE(review): relies on the coefficient buffer being pre-zeroed (host memset + shaders storing zeros after load) - confirm tcoeffs_raw points into it.
}
@@ -328,49 +328,49 @@
// DC value
set_dc_sign(&cul_level, dc_val);
{
- const int tx_types_flags[] = {
- 0, // DCT_DCT,
- 0x080000, // ADST_DCT,
- 0x020000, // DCT_ADST,
- 0x0A0000, // ADST_ADST,
- 0x090000, // FLIPADST_DCT,
- 0x028000, // DCT_FLIPADST,
- 0x0B8000, // FLIPADST_FLIPADST,
- 0x0A8000, // ADST_FLIPADST,
- 0x0B0000, // FLIPADST_ADST,
- 0x140000, // IDTX,
- 0x040000, // V_DCT,
- 0x100000, // H_DCT,
- 0x0C0000, // V_ADST,
- 0x120000, // H_ADST,
- 0x0D0000, // V_FLIPADST,
- 0x128000 // H_FLIPADST,
- };
+ const int tx_types_flags[] = {
+ 0, // DCT_DCT,
+ 0x080000, // ADST_DCT,
+ 0x020000, // DCT_ADST,
+ 0x0A0000, // ADST_ADST,
+ 0x090000, // FLIPADST_DCT,
+ 0x028000, // DCT_FLIPADST,
+ 0x0B8000, // FLIPADST_FLIPADST,
+ 0x0A8000, // ADST_FLIPADST,
+ 0x0B0000, // FLIPADST_ADST,
+ 0x140000, // IDTX,
+ 0x040000, // V_DCT,
+ 0x100000, // H_DCT,
+ 0x0C0000, // V_ADST,
+ 0x120000, // H_ADST,
+ 0x0D0000, // V_FLIPADST,
+ 0x128000 // H_FLIPADST,
+ };
- int mi_col = mbmi->mi_col;
- int mi_row = mbmi->mi_row;
- if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[mbmi->sb_type] == 1))
- mi_row -= 1;
- if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[mbmi->sb_type] == 1))
- mi_col -= 1;
- const uint32_t x = ((mi_col * MI_SIZE) >> (pd->subsampling_x + 2)) + blk_col;
- const uint32_t y = ((mi_row * MI_SIZE) >> (pd->subsampling_y + 2)) + blk_row;
- tx_block_info_gpu * block = tile_data->idct_blocks_host + tile_data->idct_blocks_ptr;
- const uint32_t coef_count = ((*eob) + 3) >> 2;
- const int av1_idct_scans_lut[TX_TYPES] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2
- };
+ int mi_col = mbmi->mi_col;
+ int mi_row = mbmi->mi_row;
+ if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[mbmi->sb_type] == 1))
+ mi_row -= 1;
+ if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[mbmi->sb_type] == 1))
+ mi_col -= 1;
+ const uint32_t x = ((mi_col * MI_SIZE) >> (pd->subsampling_x + 2)) + blk_col;
+ const uint32_t y = ((mi_row * MI_SIZE) >> (pd->subsampling_y + 2)) + blk_row;
+ tx_block_info_gpu * block = tile_data->idct_blocks_host + tile_data->idct_blocks_ptr;
+ const uint32_t coef_count = ((*eob) + 3) >> 2;
+ const int av1_idct_scans_lut[TX_TYPES] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2
+ };
- for (uint32_t c = *eob; c < (coef_count << 2); ++c)
- tcoeffs_raw[c] = 0;
- const int type_index = (xd->lossless[xd->mi[0]->segment_id] && tx_size == 0)? TX_SIZES_ALL : tx_size;
- block->flags = coef_count | tx_types_flags[tx_type] | (plane << 21) | (av1_idct_scans_lut[tx_type] << 11);
- block->input_offset = tile_data->dq_buffer_ptr;
- block->output_pos = x | (y << 16);
- block->sorting_idx = (tile_data->idct_blocks_sizes[type_index] << 8) + type_index;
- ++tile_data->idct_blocks_sizes[type_index];
- ++tile_data->idct_blocks_ptr;
- tile_data->dq_buffer_ptr += coef_count << 2;
+ // Tail padding dropped (was: for c in [*eob, coef_count << 2) tcoeffs_raw[c] = 0;).
+ // NOTE(review): relies on the coefficient buffer already being zero-filled - confirm.
+ const int type_index = (xd->lossless[xd->mi[0]->segment_id] && tx_size == 0)? TX_SIZES_ALL : tx_size;
+ block->flags = coef_count | tx_types_flags[tx_type] | (plane << 21) | (av1_idct_scans_lut[tx_type] << 11);
+ block->input_offset = tile_data->dq_buffer_ptr;
+ block->output_pos = x | (y << 16);
+ block->sorting_idx = (tile_data->idct_blocks_sizes[type_index] << 8) + type_index;
+ ++tile_data->idct_blocks_sizes[type_index];
+ ++tile_data->idct_blocks_ptr;
+ tile_data->dq_buffer_ptr += coef_count << 2;
}
return cul_level;
}
diff --git a/libav1/dx/av1_compute.cpp b/libav1/dx/av1_compute.cpp
index 209bc40..03fccbc 100644
--- a/libav1/dx/av1_compute.cpp
+++ b/libav1/dx/av1_compute.cpp
@@ -194,7 +194,7 @@
if (!shader_lib) return 0;
ID3D12Device* _device = static_cast<ID3D12Device*>(d3d12device);
- err |= (shader_lib->sig_idct = create_root_sig(_device, 2, 1, 2, 2)) == NULL;
+ err |= (shader_lib->sig_idct = create_root_sig(_device, 1, 2, 2, 2)) == NULL;
err |= (shader_lib->sig_common111 = create_root_sig(_device, 1, 1, 1, 0)) == NULL;
err |= (shader_lib->sig_common0102 = create_root_sig(_device, 0, 1, 0, 2)) == NULL;
err |= (shader_lib->sig_common0110 = create_root_sig(_device, 0, 1, 1, 0)) == NULL;
diff --git a/libav1/dx/av1_core.cpp b/libav1/dx/av1_core.cpp
index 9f43a34..01822f8 100644
--- a/libav1/dx/av1_core.cpp
+++ b/libav1/dx/av1_core.cpp
@@ -165,13 +165,14 @@
CHECK_RESULT(dec->loopfilter_blocks, mem->create_buffer(lf_blk_count * 32, MemoryType::DeviceOnly), do_assign);
CHECK_RESULT(dec->mode_info_pool, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign);
const int coef_buffer_size = target_width * target_height * 3 * IdctCoefCountNum / (IdctCoefCountDenum * 2);
- CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::DeviceUpload),
+ CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::UploadUAV),
do_assign);
for (int i = 0; i < cfg.gpu_pipeline_depth; ++i) {
av1_frame_thread_data *td = &dec->frame_thread_data[i];
td->mode_info_max = mi_size >> 1;
td->mode_info_offset = i * (mi_size >> 1);
td->coef_buffer_offset = i * coef_buffer_size;
+ td->coef_buffer_size = coef_buffer_size * sizeof(int);
CHECK_RESULT(td->command_buffer.cb_alloc, mem->create_buffer(1024 * 1024, MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->tile_data, (av1_tile_data *)mem->host_allocate(sizeof(av1_tile_data) * max_tiles), do_assign);
CHECK_RESULT(td->gen_mi_block_indexes, mem->create_buffer(sizeof(int) * block_count_4x4 * 2, MemoryType::HostRW),
@@ -179,11 +180,8 @@
CHECK_RESULT(td->gen_intra_inter_grid, mem->create_buffer(grid_w * grid_h * 6, MemoryType::HostRW), do_assign);
CHECK_RESULT(td->gen_block_map, mem->create_buffer(dec->pred_map_size * sizeof(int), MemoryType::HostRW),
do_assign);
- // CHECK_RESULT(td->mode_info, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign);
CHECK_RESULT(td->mode_info_grid, mem->create_buffer(sizeof(MB_MODE_INFO *) * mi_size, MemoryType::HostRW),
do_assign);
- // CHECK_RESULT(td->idct_coefs, mem->create_buffer(sizeof(int) * target_width * target_height * 3 / 2,
- // MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->idct_blocks_unordered,
mem->create_buffer(block_count_4x4 * IdctBlockSize, MemoryType::DeviceUpload), do_assign);
@@ -195,7 +193,9 @@
CHECK_RESULT(td->loop_rest_wiener, mem->create_buffer(64 * (block_count_4x4 >> 8), MemoryType::DeviceUpload),
do_assign);
CHECK_RESULT(td->filmgrain_rand_offset,
- mem->create_buffer(sizeof(int) * (120 * (68 + 1)), MemoryType::DeviceUpload), do_assign);
+ mem->create_buffer(sizeof(int) * ((target_width / 32 + 4) * (target_height / 32 + 1)),
+ MemoryType::DeviceUpload),
+ do_assign);
CHECK_RESULT(td->palette_buffer, mem->create_buffer(fb_size, MemoryType::DeviceUpload), do_assign);
}
@@ -266,8 +266,7 @@
D3D12_COMMAND_QUEUE_DESC desc = {};
if (!context->queue) {
desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
- desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; // enable_cpu_output == EnableHostOutput ?
- // D3D12_COMMAND_LIST_TYPE_DIRECT : D3D12_COMMAND_LIST_TYPE_COMPUTE;
+ desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
hr = device->CreateCommandQueue(&desc, IID_PPV_ARGS(&context->queue));
if (FAILED(hr)) return hr;
}
@@ -295,9 +294,6 @@
dx_compute_context *compute = &dec->compute;
mem->set_dx_context(compute);
- // if (!cfg->out_buffers_cb.get_out_buffer_cb ||
- // !cfg->out_buffers_cb.release_out_buffer_cb)
- // return -1;
dec->cb_get_output_image = cfg->out_buffers_cb.get_out_buffer_cb;
dec->cb_release_image = cfg->out_buffers_cb.release_out_buffer_cb;
dec->cb_notify_frame_ready = cfg->out_buffers_cb.notify_frame_ready_cb;
@@ -315,6 +311,7 @@
rcfg.gpu_pipeline_depth = FrameThreadDataCount;
rcfg.enable_superres = 1;
if (av1_allocate_buffers(dec, mem, rcfg, 1)) return -1;
+ memset(dec->idct_coefs->host_ptr, 0, dec->idct_coefs->size);
Microsoft::WRL::ComPtr<ID3D12Device> device = compute->device;
dec->tryhdr10x3 = cfg->tryHDR10x3;
@@ -358,7 +355,6 @@
img->size = 0;
img->fb_ptr = NULL;
img->is_valid = 0;
- // img->hw_buf = dec->output_frame_buffers[i];
MTQueuePush(&dec->image_pool, img);
}
@@ -368,8 +364,6 @@
for (int i = 0; i < rcfg.ref_fb_count; ++i) {
const int offset = dec->fb_offset + dec->fb_size * i;
HwFrameBuffer *fb = &dec->fb_pool_src[i];
- // fb->pool_ptr = pool;
- // fb->fb_ptr = pool + offset;
fb->size = dec->fb_size;
fb->base_offset = offset;
fb->ref_cnt = 0;
@@ -745,6 +739,8 @@
MTQueuePush(&dec->frame_data_pool, td->sec_thread_data);
}
+ int * coef_buf = (int *)dec->idct_coefs->host_ptr;
+ memset(coef_buf + td->coef_buffer_offset, 0, td->coef_buffer_size); // clear this frame's region while td is still owned here, before it is returned to the pool
MTQueuePush(&dec->frame_data_pool, td);
dec->curr_frame_data = NULL;
}
diff --git a/libav1/dx/av1_memory.cpp b/libav1/dx/av1_memory.cpp
index 7c75fe9..ab660e8 100644
--- a/libav1/dx/av1_memory.cpp
+++ b/libav1/dx/av1_memory.cpp
@@ -91,25 +91,23 @@
obj->size = (size + DeviceMemAlign1) & (~(DeviceMemAlign1));
D3D12_HEAP_TYPE heapLut[] = {D3D12_HEAP_TYPE_DEFAULT, D3D12_HEAP_TYPE_UPLOAD, D3D12_HEAP_TYPE_CUSTOM,
- D3D12_HEAP_TYPE_READBACK, D3D12_HEAP_TYPE_DEFAULT};
- const D3D12_HEAP_TYPE heapType = heapLut[mem];
- const D3D12_RESOURCE_STATES state = (mem == DeviceOnly) ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
- : (mem == DeviceOnlyConst)
- ? D3D12_RESOURCE_STATE_COPY_DEST
- : (mem == ReadBack) ? D3D12_RESOURCE_STATE_COPY_DEST
- : D3D12_RESOURCE_STATE_GENERIC_READ;
+ D3D12_HEAP_TYPE_READBACK, D3D12_HEAP_TYPE_DEFAULT, D3D12_HEAP_TYPE_CUSTOM };
+ D3D12_RESOURCE_STATES stateLut[] = { D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_STATE_GENERIC_READ,
+ D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS };
+ D3D12_CPU_PAGE_PROPERTY pagePropLut[] = { D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_WRITE_BACK,
+ D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE };
D3D12_HEAP_PROPERTIES heapProp;
- heapProp.Type = heapType;
- heapProp.CPUPageProperty = mem == HostRW ? D3D12_CPU_PAGE_PROPERTY_WRITE_BACK : D3D12_CPU_PAGE_PROPERTY_UNKNOWN; //
- heapProp.MemoryPoolPreference = mem == HostRW ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN;
+ heapProp.Type = heapLut[mem];
+ heapProp.CPUPageProperty = pagePropLut[mem]; // heapLut/stateLut/pagePropLut are indexed by the memory-type enum value; entry order must match the enum (UploadUAV = 5 is last)
+ heapProp.MemoryPoolPreference = (mem == HostRW || mem == UploadUAV) ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN;
heapProp.CreationNodeMask = 0;
heapProp.VisibleNodeMask = 0;
const D3D12_RESOURCE_FLAGS flags =
- (mem == DeviceOnly) ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE;
+ (mem == DeviceOnly || mem == UploadUAV) ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE;
HRESULT hr = context->device->CreateCommittedResource(&heapProp, D3D12_HEAP_FLAG_NONE,
- &CD3DX12_RESOURCE_DESC::Buffer(obj->size, flags), state, NULL,
+ &CD3DX12_RESOURCE_DESC::Buffer(obj->size, flags), stateLut[mem], NULL,
__uuidof(*obj->dev), reinterpret_cast<void **>(&obj->dev));
if (SUCCEEDED(hr) && mem != DeviceOnly && mem != DeviceOnlyConst) {
diff --git a/libav1/dx/av1_memory.h b/libav1/dx/av1_memory.h
index 6b88eda..38c2573 100644
--- a/libav1/dx/av1_memory.h
+++ b/libav1/dx/av1_memory.h
@@ -40,6 +40,7 @@
HostRW = 2,
ReadBack = 3,
DeviceOnlyConst = 4,
+ UploadUAV = 5,
};
using Microsoft::WRL::ComPtr;
diff --git a/libav1/dx/shaders/idct_lossless.hlsl b/libav1/dx/shaders/idct_lossless.hlsl
index 15de470..59ed5f9 100644
--- a/libav1/dx/shaders/idct_lossless.hlsl
+++ b/libav1/dx/shaders/idct_lossless.hlsl
@@ -34,12 +34,18 @@
const int coef_count = block.x & 0x7ff;
const int input_offset = (block.y + wi * 4) << 2;
- int4 coefs = (wi < coef_count) ? (int4)buf_input.Load4(input_offset) : int4(0, 0, 0, 0);
-
- shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = coefs.x;
- shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = coefs.y;
- shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = coefs.z;
- shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = coefs.w;
+ int4 coefs = int4(0, 0, 0, 0);
+ if (wi < coef_count)
+ {
+ coefs = (int4)buf_input.Load4(input_offset);
+ buf_input.Store4(input_offset, uint4(0, 0, 0, 0)); // write zeros back after the load; uint4 matches Store4's parameter type and idct_shader_common.h
+ }
+ const int coef_min = -(1 << (cb_bitdepth + 7));
+ const int coef_max = (1 << (cb_bitdepth + 7)) - 1;
+ shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = clamp(coefs.x, coef_min, coef_max);
+ shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = clamp(coefs.y, coef_min, coef_max);
+ shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = clamp(coefs.z, coef_min, coef_max);
+ shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = clamp(coefs.w, coef_min, coef_max);
GroupMemoryBarrier();
diff --git a/libav1/dx/shaders/idct_shader_common.h b/libav1/dx/shaders/idct_shader_common.h
index 63bed6a..425e768 100644
--- a/libav1/dx/shaders/idct_shader_common.h
+++ b/libav1/dx/shaders/idct_shader_common.h
@@ -19,9 +19,9 @@
#define IdctBlockSize 16
-ByteAddressBuffer buf_input : register(t0);
-ByteAddressBuffer buf_blocks : register(t1);
+ByteAddressBuffer buf_blocks : register(t0);
RWByteAddressBuffer buf_dst : register(u0);
+RWByteAddressBuffer buf_input : register(u1);
cbuffer cb_idct_frame_data : register(b1) {
uint4 cb_planes[3];
@@ -33,6 +33,9 @@
uint cb_wicount;
};
+int clamp_value(int value, int2 range) { return clamp(value, range.x, range.y); } // clamps value to the inclusive range [range.x, range.y]; moved up from further down this header
+
+
#define NewSqrt2Bits 12
#define NewSqrt2 5793
#define NewInvSqrt2 2896
@@ -61,7 +64,13 @@
for (i = 0; i < COEF_LOOP; ++i) { \
int4 coefs = int4(0, 0, 0, 0); \
if ((wi + i * N) < coef_count) { \
- coefs = (int4)buf_input.Load4(input_offset * 4 + i * N * 16); \
+ const int addr = input_offset * 4 + i * N * 16; \
+ coefs = (int4)buf_input.Load4(addr); \
+ buf_input.Store4(addr, uint4(0, 0, 0, 0)); \
+ coefs.x = clamp_value(coefs.x, row_clamp); \
+ coefs.y = clamp_value(coefs.y, row_clamp); \
+ coefs.z = clamp_value(coefs.z, row_clamp); \
+ coefs.w = clamp_value(coefs.w, row_clamp); \
if (SCALE_COEF) { \
coefs.x = round_shift(coefs.x * NewInvSqrt2, NewSqrt2Bits); \
coefs.y = round_shift(coefs.y * NewInvSqrt2, NewSqrt2Bits); \
@@ -178,8 +187,6 @@
static const int sinpi[] = {0, 1321, 2482, 3344, 3803};
-int clamp_value(int value, int2 range) { return clamp(value, range.x, range.y); }
-
int round_shift(int value, int bit) {
if (bit == 0) return value;
return (int)((value + (1 << (bit - 1))) >> bit);
diff --git a/libav1/dx/transform.cpp b/libav1/dx/transform.cpp
index 3b3e844..d087b86 100644
--- a/libav1/dx/transform.cpp
+++ b/libav1/dx/transform.cpp
@@ -96,9 +96,9 @@
av1_tile_data *tdata = td->tile_data;
command_list->SetComputeRootSignature(dec->shader_lib->sig_idct.Get());
- command_list->SetComputeRootShaderResourceView(0, dec->idct_coefs->dev->GetGPUVirtualAddress());
- command_list->SetComputeRootShaderResourceView(1, dec->idct_blocks->dev->GetGPUVirtualAddress());
- command_list->SetComputeRootUnorderedAccessView(2, dec->idct_residuals->dev->GetGPUVirtualAddress());
+ command_list->SetComputeRootShaderResourceView(0, dec->idct_blocks->dev->GetGPUVirtualAddress());
+ command_list->SetComputeRootUnorderedAccessView(1, dec->idct_residuals->dev->GetGPUVirtualAddress());
+ command_list->SetComputeRootUnorderedAccessView(2, dec->idct_coefs->dev->GetGPUVirtualAddress());
command_list->SetComputeRootConstantBufferView(4, cbo.dev_address);
for (int type = 0; type <= TX_SIZES_ALL; ++type) {
int offset = tdata->idct_blocks_sizes[type];
diff --git a/libav1/dx/types.h b/libav1/dx/types.h
index b4fba6d..b689df8 100644
--- a/libav1/dx/types.h
+++ b/libav1/dx/types.h
@@ -42,7 +42,6 @@
uint32_t flags;
MV mv0;
MV mv1;
- // uint32_t sorting_idx;
};
enum InterTypes {
@@ -159,6 +158,7 @@
int mode_info_offset;
int mode_info_max;
int coef_buffer_offset;
+ int coef_buffer_size;
av1_tile_data* tile_data;
HwFrameBuffer* frame_buffer;
HwFrameBuffer* back_buffer0;
@@ -242,5 +242,4 @@
int tryhdr10x3;
} Av1Core;
-Av1Core* Get(uint32_t may_be_null = 0);
void PutPerfMarker(av1_frame_thread_data* td, volatile int64_t* marker);