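Move coefficient clamping and zero-filling out of the CPU path (function av1_read_coeffs_txb): the GPU IDCT shaders now clamp the coefficients and clear the coefficient buffer after reading it. Change-Id: I2dbc785ca94b71479aa438953263022720ca68f4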
diff --git a/libav1/av1/decoder/decodetxb.c b/libav1/av1/decoder/decodetxb.c index c4451da..955cd06 100644 --- a/libav1/av1/decoder/decodetxb.c +++ b/libav1/av1/decoder/decodetxb.c
@@ -316,10 +316,10 @@ if (sign) { dq_coeff = -dq_coeff; } - tcoeffs_raw[c] = clamp(dq_coeff, min_value, max_value); + tcoeffs_raw[c] = dq_coeff;// clamp(dq_coeff, min_value, max_value); } - else - tcoeffs_raw[c] = 0; + //else + // tcoeffs_raw[c] = 0; } @@ -328,49 +328,49 @@ // DC value set_dc_sign(&cul_level, dc_val); { - const int tx_types_flags[] = { - 0, // DCT_DCT, - 0x080000, // ADST_DCT, - 0x020000, // DCT_ADST, - 0x0A0000, // ADST_ADST, - 0x090000, // FLIPADST_DCT, - 0x028000, // DCT_FLIPADST, - 0x0B8000, // FLIPADST_FLIPADST, - 0x0A8000, // ADST_FLIPADST, - 0x0B0000, // FLIPADST_ADST, - 0x140000, // IDTX, - 0x040000, // V_DCT, - 0x100000, // H_DCT, - 0x0C0000, // V_ADST, - 0x120000, // H_ADST, - 0x0D0000, // V_FLIPADST, - 0x128000 // H_FLIPADST, - }; + const int tx_types_flags[] = { + 0, // DCT_DCT, + 0x080000, // ADST_DCT, + 0x020000, // DCT_ADST, + 0x0A0000, // ADST_ADST, + 0x090000, // FLIPADST_DCT, + 0x028000, // DCT_FLIPADST, + 0x0B8000, // FLIPADST_FLIPADST, + 0x0A8000, // ADST_FLIPADST, + 0x0B0000, // FLIPADST_ADST, + 0x140000, // IDTX, + 0x040000, // V_DCT, + 0x100000, // H_DCT, + 0x0C0000, // V_ADST, + 0x120000, // H_ADST, + 0x0D0000, // V_FLIPADST, + 0x128000 // H_FLIPADST, + }; - int mi_col = mbmi->mi_col; - int mi_row = mbmi->mi_row; - if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[mbmi->sb_type] == 1)) - mi_row -= 1; - if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[mbmi->sb_type] == 1)) - mi_col -= 1; - const uint32_t x = ((mi_col * MI_SIZE) >> (pd->subsampling_x + 2)) + blk_col; - const uint32_t y = ((mi_row * MI_SIZE) >> (pd->subsampling_y + 2)) + blk_row; - tx_block_info_gpu * block = tile_data->idct_blocks_host + tile_data->idct_blocks_ptr; - const uint32_t coef_count = ((*eob) + 3) >> 2; - const int av1_idct_scans_lut[TX_TYPES] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2 - }; + int mi_col = mbmi->mi_col; + int mi_row = mbmi->mi_row; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[mbmi->sb_type] == 
1)) + mi_row -= 1; + if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[mbmi->sb_type] == 1)) + mi_col -= 1; + const uint32_t x = ((mi_col * MI_SIZE) >> (pd->subsampling_x + 2)) + blk_col; + const uint32_t y = ((mi_row * MI_SIZE) >> (pd->subsampling_y + 2)) + blk_row; + tx_block_info_gpu * block = tile_data->idct_blocks_host + tile_data->idct_blocks_ptr; + const uint32_t coef_count = ((*eob) + 3) >> 2; + const int av1_idct_scans_lut[TX_TYPES] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2 + }; - for (uint32_t c = *eob; c < (coef_count << 2); ++c) - tcoeffs_raw[c] = 0; - const int type_index = (xd->lossless[xd->mi[0]->segment_id] && tx_size == 0)? TX_SIZES_ALL : tx_size; - block->flags = coef_count | tx_types_flags[tx_type] | (plane << 21) | (av1_idct_scans_lut[tx_type] << 11); - block->input_offset = tile_data->dq_buffer_ptr; - block->output_pos = x | (y << 16); - block->sorting_idx = (tile_data->idct_blocks_sizes[type_index] << 8) + type_index; - ++tile_data->idct_blocks_sizes[type_index]; - ++tile_data->idct_blocks_ptr; - tile_data->dq_buffer_ptr += coef_count << 2; + //for (uint32_t c = *eob; c < (coef_count << 2); ++c) + // tcoeffs_raw[c] = 0; + const int type_index = (xd->lossless[xd->mi[0]->segment_id] && tx_size == 0)? TX_SIZES_ALL : tx_size; + block->flags = coef_count | tx_types_flags[tx_type] | (plane << 21) | (av1_idct_scans_lut[tx_type] << 11); + block->input_offset = tile_data->dq_buffer_ptr; + block->output_pos = x | (y << 16); + block->sorting_idx = (tile_data->idct_blocks_sizes[type_index] << 8) + type_index; + ++tile_data->idct_blocks_sizes[type_index]; + ++tile_data->idct_blocks_ptr; + tile_data->dq_buffer_ptr += coef_count << 2; } return cul_level; }
diff --git a/libav1/dx/av1_compute.cpp b/libav1/dx/av1_compute.cpp index 209bc40..03fccbc 100644 --- a/libav1/dx/av1_compute.cpp +++ b/libav1/dx/av1_compute.cpp
@@ -194,7 +194,7 @@ if (!shader_lib) return 0; ID3D12Device* _device = static_cast<ID3D12Device*>(d3d12device); - err |= (shader_lib->sig_idct = create_root_sig(_device, 2, 1, 2, 2)) == NULL; + err |= (shader_lib->sig_idct = create_root_sig(_device, 1, 2, 2, 2)) == NULL; err |= (shader_lib->sig_common111 = create_root_sig(_device, 1, 1, 1, 0)) == NULL; err |= (shader_lib->sig_common0102 = create_root_sig(_device, 0, 1, 0, 2)) == NULL; err |= (shader_lib->sig_common0110 = create_root_sig(_device, 0, 1, 1, 0)) == NULL;
diff --git a/libav1/dx/av1_core.cpp b/libav1/dx/av1_core.cpp index 9f43a34..01822f8 100644 --- a/libav1/dx/av1_core.cpp +++ b/libav1/dx/av1_core.cpp
@@ -165,13 +165,14 @@ CHECK_RESULT(dec->loopfilter_blocks, mem->create_buffer(lf_blk_count * 32, MemoryType::DeviceOnly), do_assign); CHECK_RESULT(dec->mode_info_pool, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign); const int coef_buffer_size = target_width * target_height * 3 * IdctCoefCountNum / (IdctCoefCountDenum * 2); - CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::DeviceUpload), + CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::UploadUAV), do_assign); for (int i = 0; i < cfg.gpu_pipeline_depth; ++i) { av1_frame_thread_data *td = &dec->frame_thread_data[i]; td->mode_info_max = mi_size >> 1; td->mode_info_offset = i * (mi_size >> 1); td->coef_buffer_offset = i * coef_buffer_size; + td->coef_buffer_size = coef_buffer_size * sizeof(int); CHECK_RESULT(td->command_buffer.cb_alloc, mem->create_buffer(1024 * 1024, MemoryType::DeviceUpload), do_assign); CHECK_RESULT(td->tile_data, (av1_tile_data *)mem->host_allocate(sizeof(av1_tile_data) * max_tiles), do_assign); CHECK_RESULT(td->gen_mi_block_indexes, mem->create_buffer(sizeof(int) * block_count_4x4 * 2, MemoryType::HostRW), @@ -179,11 +180,8 @@ CHECK_RESULT(td->gen_intra_inter_grid, mem->create_buffer(grid_w * grid_h * 6, MemoryType::HostRW), do_assign); CHECK_RESULT(td->gen_block_map, mem->create_buffer(dec->pred_map_size * sizeof(int), MemoryType::HostRW), do_assign); - // CHECK_RESULT(td->mode_info, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign); CHECK_RESULT(td->mode_info_grid, mem->create_buffer(sizeof(MB_MODE_INFO *) * mi_size, MemoryType::HostRW), do_assign); - // CHECK_RESULT(td->idct_coefs, mem->create_buffer(sizeof(int) * target_width * target_height * 3 / 2, - // MemoryType::DeviceUpload), do_assign); CHECK_RESULT(td->idct_blocks_unordered, mem->create_buffer(block_count_4x4 * IdctBlockSize, MemoryType::DeviceUpload), do_assign); @@ 
-195,7 +193,9 @@ CHECK_RESULT(td->loop_rest_wiener, mem->create_buffer(64 * (block_count_4x4 >> 8), MemoryType::DeviceUpload), do_assign); CHECK_RESULT(td->filmgrain_rand_offset, - mem->create_buffer(sizeof(int) * (120 * (68 + 1)), MemoryType::DeviceUpload), do_assign); + mem->create_buffer(sizeof(int) * ((target_width / 32 + 4) * (target_height / 32 + 1)), + MemoryType::DeviceUpload), + do_assign); CHECK_RESULT(td->palette_buffer, mem->create_buffer(fb_size, MemoryType::DeviceUpload), do_assign); } @@ -266,8 +266,7 @@ D3D12_COMMAND_QUEUE_DESC desc = {}; if (!context->queue) { desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; - desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; // enable_cpu_output == EnableHostOutput ? - // D3D12_COMMAND_LIST_TYPE_DIRECT : D3D12_COMMAND_LIST_TYPE_COMPUTE; + desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; hr = device->CreateCommandQueue(&desc, IID_PPV_ARGS(&context->queue)); if (FAILED(hr)) return hr; } @@ -295,9 +294,6 @@ dx_compute_context *compute = &dec->compute; mem->set_dx_context(compute); - // if (!cfg->out_buffers_cb.get_out_buffer_cb || - // !cfg->out_buffers_cb.release_out_buffer_cb) - // return -1; dec->cb_get_output_image = cfg->out_buffers_cb.get_out_buffer_cb; dec->cb_release_image = cfg->out_buffers_cb.release_out_buffer_cb; dec->cb_notify_frame_ready = cfg->out_buffers_cb.notify_frame_ready_cb; @@ -315,6 +311,7 @@ rcfg.gpu_pipeline_depth = FrameThreadDataCount; rcfg.enable_superres = 1; if (av1_allocate_buffers(dec, mem, rcfg, 1)) return -1; + memset(dec->idct_coefs->host_ptr, 0, dec->idct_coefs->size); Microsoft::WRL::ComPtr<ID3D12Device> device = compute->device; dec->tryhdr10x3 = cfg->tryHDR10x3; @@ -358,7 +355,6 @@ img->size = 0; img->fb_ptr = NULL; img->is_valid = 0; - // img->hw_buf = dec->output_frame_buffers[i]; MTQueuePush(&dec->image_pool, img); } @@ -368,8 +364,6 @@ for (int i = 0; i < rcfg.ref_fb_count; ++i) { const int offset = dec->fb_offset + dec->fb_size * i; HwFrameBuffer *fb = &dec->fb_pool_src[i]; - // 
fb->pool_ptr = pool; - // fb->fb_ptr = pool + offset; fb->size = dec->fb_size; fb->base_offset = offset; fb->ref_cnt = 0; @@ -745,6 +739,8 @@ MTQueuePush(&dec->frame_data_pool, td->sec_thread_data); } MTQueuePush(&dec->frame_data_pool, td); + int * coef_buf = (int *)dec->idct_coefs->host_ptr; + memset(coef_buf + td->coef_buffer_offset, 0, td->coef_buffer_size); dec->curr_frame_data = NULL; }
diff --git a/libav1/dx/av1_memory.cpp b/libav1/dx/av1_memory.cpp index 7c75fe9..ab660e8 100644 --- a/libav1/dx/av1_memory.cpp +++ b/libav1/dx/av1_memory.cpp
@@ -91,25 +91,23 @@ obj->size = (size + DeviceMemAlign1) & (~(DeviceMemAlign1)); D3D12_HEAP_TYPE heapLut[] = {D3D12_HEAP_TYPE_DEFAULT, D3D12_HEAP_TYPE_UPLOAD, D3D12_HEAP_TYPE_CUSTOM, - D3D12_HEAP_TYPE_READBACK, D3D12_HEAP_TYPE_DEFAULT}; - const D3D12_HEAP_TYPE heapType = heapLut[mem]; - const D3D12_RESOURCE_STATES state = (mem == DeviceOnly) ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS - : (mem == DeviceOnlyConst) - ? D3D12_RESOURCE_STATE_COPY_DEST - : (mem == ReadBack) ? D3D12_RESOURCE_STATE_COPY_DEST - : D3D12_RESOURCE_STATE_GENERIC_READ; + D3D12_HEAP_TYPE_READBACK, D3D12_HEAP_TYPE_DEFAULT, D3D12_HEAP_TYPE_CUSTOM }; + D3D12_RESOURCE_STATES stateLut[] = { D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_STATE_GENERIC_READ, + D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS }; + D3D12_CPU_PAGE_PROPERTY pagePropLut[] = { D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_WRITE_BACK, + D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE }; D3D12_HEAP_PROPERTIES heapProp; - heapProp.Type = heapType; - heapProp.CPUPageProperty = mem == HostRW ? D3D12_CPU_PAGE_PROPERTY_WRITE_BACK : D3D12_CPU_PAGE_PROPERTY_UNKNOWN; // - heapProp.MemoryPoolPreference = mem == HostRW ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN; + heapProp.Type = heapLut[mem]; + heapProp.CPUPageProperty = pagePropLut[mem]; // + heapProp.MemoryPoolPreference = (mem == HostRW || mem == UploadUAV) ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN; heapProp.CreationNodeMask = 0; heapProp.VisibleNodeMask = 0; const D3D12_RESOURCE_FLAGS flags = - (mem == DeviceOnly) ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE; + (mem == DeviceOnly || mem == UploadUAV) ? 
D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE; HRESULT hr = context->device->CreateCommittedResource(&heapProp, D3D12_HEAP_FLAG_NONE, - &CD3DX12_RESOURCE_DESC::Buffer(obj->size, flags), state, NULL, + &CD3DX12_RESOURCE_DESC::Buffer(obj->size, flags), stateLut[mem], NULL, __uuidof(*obj->dev), reinterpret_cast<void **>(&obj->dev)); if (SUCCEEDED(hr) && mem != DeviceOnly && mem != DeviceOnlyConst) {
diff --git a/libav1/dx/av1_memory.h b/libav1/dx/av1_memory.h index 6b88eda..38c2573 100644 --- a/libav1/dx/av1_memory.h +++ b/libav1/dx/av1_memory.h
@@ -40,6 +40,7 @@ HostRW = 2, ReadBack = 3, DeviceOnlyConst = 4, + UploadUAV = 5, }; using Microsoft::WRL::ComPtr;
diff --git a/libav1/dx/shaders/idct_lossless.hlsl b/libav1/dx/shaders/idct_lossless.hlsl index 15de470..59ed5f9 100644 --- a/libav1/dx/shaders/idct_lossless.hlsl +++ b/libav1/dx/shaders/idct_lossless.hlsl
@@ -34,12 +34,18 @@ const int coef_count = block.x & 0x7ff; const int input_offset = (block.y + wi * 4) << 2; - int4 coefs = (wi < coef_count) ? (int4)buf_input.Load4(input_offset) : int4(0, 0, 0, 0); - - shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = coefs.x; - shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = coefs.y; - shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = coefs.z; - shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = coefs.w; + int4 coefs = int4(0, 0, 0, 0); + if (wi < coef_count) + { + coefs = (int4)buf_input.Load4(input_offset); + buf_input.Store4(input_offset, int4(0, 0, 0, 0)); + } + const int coef_min = -(1 << (cb_bitdepth + 7)); + const int coef_max = (1 << (cb_bitdepth + 7)) - 1; + shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = clamp(coefs.x, coef_min, coef_max); + shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = clamp(coefs.y, coef_min, coef_max); + shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = clamp(coefs.z, coef_min, coef_max); + shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = clamp(coefs.w, coef_min, coef_max); GroupMemoryBarrier();
diff --git a/libav1/dx/shaders/idct_shader_common.h b/libav1/dx/shaders/idct_shader_common.h index 63bed6a..425e768 100644 --- a/libav1/dx/shaders/idct_shader_common.h +++ b/libav1/dx/shaders/idct_shader_common.h
@@ -19,9 +19,9 @@ #define IdctBlockSize 16 -ByteAddressBuffer buf_input : register(t0); -ByteAddressBuffer buf_blocks : register(t1); +ByteAddressBuffer buf_blocks : register(t0); RWByteAddressBuffer buf_dst : register(u0); +RWByteAddressBuffer buf_input : register(u1); cbuffer cb_idct_frame_data : register(b1) { uint4 cb_planes[3]; @@ -33,6 +33,9 @@ uint cb_wicount; }; +int clamp_value(int value, int2 range) { return clamp(value, range.x, range.y); } + + #define NewSqrt2Bits 12 #define NewSqrt2 5793 #define NewInvSqrt2 2896 @@ -61,7 +64,13 @@ for (i = 0; i < COEF_LOOP; ++i) { \ int4 coefs = int4(0, 0, 0, 0); \ if ((wi + i * N) < coef_count) { \ - coefs = (int4)buf_input.Load4(input_offset * 4 + i * N * 16); \ + const int addr = input_offset * 4 + i * N * 16; \ + coefs = (int4)buf_input.Load4(addr); \ + buf_input.Store4(addr, uint4(0, 0, 0, 0)); \ + coefs.x = clamp_value(coefs.x, row_clamp); \ + coefs.y = clamp_value(coefs.y, row_clamp); \ + coefs.z = clamp_value(coefs.z, row_clamp); \ + coefs.w = clamp_value(coefs.w, row_clamp); \ if (SCALE_COEF) { \ coefs.x = round_shift(coefs.x * NewInvSqrt2, NewSqrt2Bits); \ coefs.y = round_shift(coefs.y * NewInvSqrt2, NewSqrt2Bits); \ @@ -178,8 +187,6 @@ static const int sinpi[] = {0, 1321, 2482, 3344, 3803}; -int clamp_value(int value, int2 range) { return clamp(value, range.x, range.y); } - int round_shift(int value, int bit) { if (bit == 0) return value; return (int)((value + (1 << (bit - 1))) >> bit);
diff --git a/libav1/dx/transform.cpp b/libav1/dx/transform.cpp index 3b3e844..d087b86 100644 --- a/libav1/dx/transform.cpp +++ b/libav1/dx/transform.cpp
@@ -96,9 +96,9 @@ av1_tile_data *tdata = td->tile_data; command_list->SetComputeRootSignature(dec->shader_lib->sig_idct.Get()); - command_list->SetComputeRootShaderResourceView(0, dec->idct_coefs->dev->GetGPUVirtualAddress()); - command_list->SetComputeRootShaderResourceView(1, dec->idct_blocks->dev->GetGPUVirtualAddress()); - command_list->SetComputeRootUnorderedAccessView(2, dec->idct_residuals->dev->GetGPUVirtualAddress()); + command_list->SetComputeRootShaderResourceView(0, dec->idct_blocks->dev->GetGPUVirtualAddress()); + command_list->SetComputeRootUnorderedAccessView(1, dec->idct_residuals->dev->GetGPUVirtualAddress()); + command_list->SetComputeRootUnorderedAccessView(2, dec->idct_coefs->dev->GetGPUVirtualAddress()); command_list->SetComputeRootConstantBufferView(4, cbo.dev_address); for (int type = 0; type <= TX_SIZES_ALL; ++type) { int offset = tdata->idct_blocks_sizes[type];
diff --git a/libav1/dx/types.h b/libav1/dx/types.h index b4fba6d..b689df8 100644 --- a/libav1/dx/types.h +++ b/libav1/dx/types.h
@@ -42,7 +42,6 @@ uint32_t flags; MV mv0; MV mv1; - // uint32_t sorting_idx; }; enum InterTypes { @@ -159,6 +158,7 @@ int mode_info_offset; int mode_info_max; int coef_buffer_offset; + int coef_buffer_size; av1_tile_data* tile_data; HwFrameBuffer* frame_buffer; HwFrameBuffer* back_buffer0; @@ -242,5 +242,4 @@ int tryhdr10x3; } Av1Core; -Av1Core* Get(uint32_t may_be_null = 0); void PutPerfMarker(av1_frame_thread_data* td, volatile int64_t* marker);