// blob: 9f43a34ac2d17ec3417422c23ece4a0eee18a81a [file] [log] [blame]
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "dx/av1_core.h"
#include "dx/av1_memory.h"
#include "dx/av1_compute.h"
#include <assert.h>
#include <new>
#include "dx/types.h"
#include "av1/decoder/decoder.h"
#include "av1\common\scan.h"
#include "av1\common\idct.h"
#include "av1\common\filter.h"
#include "av1/common/reconinter.h"
#include "av1/common/warped_motion.h"
#include "av1/common/reconintra.h"
#include "aom_dsp/intrapred_common.h"
// Sizing and tuning constants used by the buffer allocation and frame setup
// code in this file.
enum {
// Per-block multiplier applied to the 4x4 block count when sizing the
// idct_blocks / idct_blocks_unordered buffers.
IdctBlockSize = 16,
// Denominator (24) and numerator (17) of the ratio applied when sizing the
// coefficient buffer and when computing per-tile coefficient offsets.
IdctCoefCountDenum = 4 * 4 + 2 * 2 * 2,
IdctCoefCountNum = 17, //~71%
// av1_setup_frame enables the extended IDCT buffer when any tile covers at
// most this many superblocks (columns * rows).
TileSbSizeThreshold = 2,
// Not referenced in the visible part of this file.
FbYStripe = 256,
FbUvStripe = 128,
};
// Forward declarations. gpu_thread_hook is defined further down in this file;
// the definitions of the other routines are not in the visible part of it.
int av1_postprocess_copy_output(Av1Core *dec, AV1_COMMON *cm);
void av1_loopfilter_gpu(Av1Core *dec, AV1_COMMON *cm, MACROBLOCKD *xd);
void av1_cdef_looprestoration(Av1Core *dec, AV1_COMMON *cm, void *lr_ctxt);
void av1_prediction_run_all(Av1Core *dec, AV1_COMMON *cm, TileInfo *tile);
void av1_idct_run(Av1Core *dec);
void av1_inter_ext_borders(Av1Core *dec, AV1_COMMON *cm);
void av1_mi_push_block(AV1Decoder *pbi, AV1_COMMON *cm, MACROBLOCKD *xd);
void av1_prediction_gen_blocks(AV1Decoder *pbi, Av1Core *dec);
// Worker thread entry point started from av1_setup_frame.
static THREADFN gpu_thread_hook(void *pdata);
// Advances a 16-bit linear-feedback shift register by one step: the feedback
// bit is the XOR of bits 0, 1, 3 and 12 of `val`, the register is shifted
// right by one and the feedback bit is inserted at bit 15.
static int get_random_number_test(int val) {
  const unsigned int feedback = (unsigned int)((val ^ (val >> 1) ^ (val >> 3) ^ (val >> 12)) & 1);
  return (int)((unsigned int)(val >> 1) | (feedback << 15));
}
// Evaluates FUNC exactly once. When DO_ASSIGN is non-zero the result is stored
// in DST, or the enclosing function returns -1 if the result is null/zero.
// When DO_ASSIGN is zero FUNC is still evaluated (so an allocator that only
// counts sizes sees the request) but DST is neither evaluated nor written.
// The body is wrapped in do { } while (0) so that `CHECK_RESULT(...);` is a
// single statement and composes safely with unbraced if/else and loops; the
// temporary uses a name that cannot collide with identifiers inside FUNC.
#define CHECK_RESULT(DST, FUNC, DO_ASSIGN) \
  do {                                     \
    auto check_result_value_ = (FUNC);     \
    if (DO_ASSIGN) {                       \
      if (!check_result_value_)            \
        return -1;                         \
      else                                 \
        (DST) = check_result_value_;       \
    }                                      \
  } while (0)
// Parameters that drive the sizes computed by av1_allocate_buffers.
struct resource_config {
// Bit depth; values above 8 select two bytes per sample for frame buffers.
int bitdepth;
// Frame dimensions; av1_allocate_buffers rounds both up to a multiple of 128.
int width;
int height;
// Number of frame buffers reserved in the frame buffer pool allocation.
int fb_count;
// Number of fb_pool_src entries that receive mvs/seg host allocations.
int ref_fb_count;
// Number of frame_thread_data entries that receive per-frame buffers.
int gpu_pipeline_depth;
// Copied into Av1Core::enable_superres.
int enable_superres;
};
// Requests every host and device buffer the decoder needs for the given
// configuration from `mem`.
//
// dec        decoder core that receives the resulting pointers and sizes.
// mem        allocator used for host_allocate() and create_buffer() requests.
// cfg        stream dimensions, bit depth and pool sizes.
// do_assign  when non-zero, results are stored in `dec` and a null result
//            makes the function return -1; when zero, the requests are still
//            issued but results are neither checked nor stored (used by
//            av1_query_memory_requirements with a dummy allocator).
//
// Returns 0 on success, -1 if an allocation fails while do_assign is set.
int av1_allocate_buffers(Av1Core *dec, av1_memory_manager_base *mem, const resource_config &cfg, int do_assign) {
// All sizes are derived from dimensions rounded up to a multiple of 128.
const int target_width = (cfg.width + 127) & ~127;
const int target_height = (cfg.height + 127) & ~127;
// Number of 4x4 blocks across luma plus both half-resolution chroma planes.
const int block_count_4x4 = target_width * target_height * 3 / (2 * 4 * 4);
const int mi_cols = target_width >> 2;
const int mi_rows = target_height >> 2;
// Upper bounds assuming one tile per 64 pixels in each direction.
const int max_tile_cols = target_width >> 6;
const int max_tile_rows = target_height >> 6;
dec->block_count4x4 = block_count_4x4;
// Host-side decoder state.
CHECK_RESULT(dec->pbi_alloc, (void *)mem->host_allocate(sizeof(AV1Decoder)), do_assign);
CHECK_RESULT(dec->buf_pool_alloc, (void *)mem->host_allocate(sizeof(BufferPool)), do_assign);
// Five above-context tables (see av1_setup_context_buffers for their use),
// each an array of per-tile-row pointers to mi_cols entries.
for (int p = 0; p < 5; ++p)
CHECK_RESULT(dec->above_context_alloc[p], (void **)mem->host_allocate(sizeof(void *) * max_tile_rows), do_assign);
for (int row = 0; row < max_tile_rows; ++row)
for (int p = 0; p < 5; ++p)
CHECK_RESULT(dec->above_context_alloc[p][row], (void *)mem->host_allocate(sizeof(ENTROPY_CONTEXT) * mi_cols),
do_assign);
// Restoration unit info: one array for luma and one for each chroma plane.
const int y_max_rst_units = max_tile_cols * max_tile_rows;
const int uv_max_rst_units = (max_tile_cols >> 1) * (max_tile_rows >> 1);
CHECK_RESULT(dec->restoration_info_alloc[0],
(void *)mem->host_allocate(sizeof(RestorationUnitInfo) * y_max_rst_units), do_assign);
CHECK_RESULT(dec->restoration_info_alloc[1],
(void *)mem->host_allocate(sizeof(RestorationUnitInfo) * uv_max_rst_units), do_assign);
CHECK_RESULT(dec->restoration_info_alloc[2],
(void *)mem->host_allocate(sizeof(RestorationUnitInfo) * uv_max_rst_units), do_assign);
const int tmvs_size = ((mi_rows + MAX_MIB_SIZE) >> 1) * (mi_cols >> 1);
CHECK_RESULT(dec->tplmvs_alloc, (void *)mem->host_allocate(sizeof(TPL_MV_REF) * tmvs_size), do_assign);
// Residual storage: full-size luma plus two chroma planes whose halved
// dimensions are again rounded up to a multiple of 128.
const int residuals_size = target_width * target_height +
2 * ((((target_width >> 1) + 127) & (~127)) * (((target_height >> 1) + 127) & (~127)));
CHECK_RESULT(dec->idct_residuals, mem->create_buffer(sizeof(short) * residuals_size, MemoryType::DeviceOnly),
do_assign);
CHECK_RESULT(dec->idct_blocks, mem->create_buffer(block_count_4x4 * IdctBlockSize, MemoryType::DeviceOnly),
do_assign);
// Lookup tables and film grain buffers.
CHECK_RESULT(dec->inter_mask_lut, mem->create_buffer(sizeof(wedge_mask_buf) * 2, MemoryType::DeviceOnlyConst),
do_assign);
CHECK_RESULT(dec->inter_warp_filter, mem->create_buffer(sizeof(warped_filter) * 2, MemoryType::DeviceOnlyConst),
do_assign);
CHECK_RESULT(dec->filmgrain_noise, mem->create_buffer(sizeof(int) * (96 * 96 + 48 * 48 * 2), MemoryType::DeviceOnly),
do_assign);
CHECK_RESULT(dec->filmgrain_gaus, mem->create_buffer(sizeof(dx_gaussian_sequence), MemoryType::DeviceOnlyConst),
do_assign);
CHECK_RESULT(dec->filmgrain_random_luma, mem->create_buffer(sizeof(int) * (65536 + 1), MemoryType::DeviceOnlyConst),
do_assign);
CHECK_RESULT(dec->filmgrain_random_chroma, mem->create_buffer(sizeof(int) * (65536 + 1), MemoryType::DeviceOnlyConst),
do_assign);
// frame buffer size:
// Each row carries a 16-sample border on both sides; the total is rounded up
// to a multiple of 256 bytes.
const int border = 16;
const int y_stride = target_width + 2 * border;
const int y_size = y_stride * target_height;
const int uv_stride = (target_width >> 1) + border * 2;
const int uv_size = uv_stride * (target_height >> 1);
const int bpp = cfg.bitdepth > 8 ? 2 : 1;
const int fb_size = ((y_size + 2 * uv_size) * bpp + 255) & ~255;
dec->fb_size = fb_size;
// One extra frame-sized region precedes the pooled frame buffers
// (av1_create_gpu_decoder uses it for back_buffer1).
dec->fb_offset = fb_size;
dec->enable_superres = cfg.enable_superres;
CHECK_RESULT(dec->frame_buffer_pool,
mem->create_buffer(fb_size * cfg.fb_count + dec->fb_offset, MemoryType::DeviceOnly), do_assign);
const int grid_w = mi_cols + 2 + 128;
const int grid_h = mi_rows + 2 + 128;
const int cdef_blocks = target_width * target_height / 64 / 64;
const int block_count_8x8 = block_count_4x4 >> 2;
const int mi_size = mi_cols * mi_rows;
const int lf_blk_count = (target_width / 64) * (target_height / 4) + // vert luma
(target_width / 128) * (target_height / 8) * 2 + // vert chroma
(target_height / 64) * (target_width / 4) + // hor luma
(target_height / 128) * (target_width / 8) * 2; // hor chroma
const int max_tiles = max_tile_cols * max_tile_rows;
dec->pred_map_size = (256 + 32) * max_tiles + (mi_size >> 2) * 10;
CHECK_RESULT(dec->prediction_blocks, mem->create_buffer(block_count_4x4 * 16 * 2, MemoryType::DeviceOnly), do_assign);
CHECK_RESULT(dec->prediction_blocks_warp, mem->create_buffer(block_count_8x8 * 48, MemoryType::DeviceOnly),
do_assign);
CHECK_RESULT(dec->loopfilter_blocks, mem->create_buffer(lf_blk_count * 32, MemoryType::DeviceOnly), do_assign);
// The mode info pool and the coefficient buffer are shared by the per-frame
// entries below, which address them through the offsets set in the loop.
// NOTE(review): both are sized for two slices (mi_size >> 1 each, and
// coef_buffer_size * 2) -- confirm gpu_pipeline_depth never exceeds that.
CHECK_RESULT(dec->mode_info_pool, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign);
const int coef_buffer_size = target_width * target_height * 3 * IdctCoefCountNum / (IdctCoefCountDenum * 2);
CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::DeviceUpload),
do_assign);
// Per-frame (pipeline stage) resources.
for (int i = 0; i < cfg.gpu_pipeline_depth; ++i) {
av1_frame_thread_data *td = &dec->frame_thread_data[i];
td->mode_info_max = mi_size >> 1;
td->mode_info_offset = i * (mi_size >> 1);
td->coef_buffer_offset = i * coef_buffer_size;
CHECK_RESULT(td->command_buffer.cb_alloc, mem->create_buffer(1024 * 1024, MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->tile_data, (av1_tile_data *)mem->host_allocate(sizeof(av1_tile_data) * max_tiles), do_assign);
CHECK_RESULT(td->gen_mi_block_indexes, mem->create_buffer(sizeof(int) * block_count_4x4 * 2, MemoryType::HostRW),
do_assign);
CHECK_RESULT(td->gen_intra_inter_grid, mem->create_buffer(grid_w * grid_h * 6, MemoryType::HostRW), do_assign);
CHECK_RESULT(td->gen_block_map, mem->create_buffer(dec->pred_map_size * sizeof(int), MemoryType::HostRW),
do_assign);
// CHECK_RESULT(td->mode_info, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign);
CHECK_RESULT(td->mode_info_grid, mem->create_buffer(sizeof(MB_MODE_INFO *) * mi_size, MemoryType::HostRW),
do_assign);
// CHECK_RESULT(td->idct_coefs, mem->create_buffer(sizeof(int) * target_width * target_height * 3 / 2,
// MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->idct_blocks_unordered,
mem->create_buffer(block_count_4x4 * IdctBlockSize, MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->cdef_indexes, mem->create_buffer(sizeof(int) * cdef_blocks, MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->cdef_skips, mem->create_buffer(sizeof(int) * cdef_blocks * 16 * 16, MemoryType::DeviceUpload),
do_assign);
CHECK_RESULT(td->loop_rest_types, mem->create_buffer(16 * (block_count_4x4 >> 8), MemoryType::DeviceUpload),
do_assign);
CHECK_RESULT(td->loop_rest_wiener, mem->create_buffer(64 * (block_count_4x4 >> 8), MemoryType::DeviceUpload),
do_assign);
CHECK_RESULT(td->filmgrain_rand_offset,
mem->create_buffer(sizeof(int) * (120 * (68 + 1)), MemoryType::DeviceUpload), do_assign);
CHECK_RESULT(td->palette_buffer, mem->create_buffer(fb_size, MemoryType::DeviceUpload), do_assign);
}
// Per-reference-frame host storage: motion vectors on an 8x8 grid and a
// one-byte-per-4x4 segmentation map.
const int mvs_size = (target_width >> 3) * (target_height >> 3) * sizeof(MV_REF);
const int seg_size = (target_width >> 2) * (target_height >> 2);
for (int i = 0; i < cfg.ref_fb_count; ++i) {
HwFrameBuffer *fb = &dec->fb_pool_src[i];
CHECK_RESULT(fb->mvs_alloc, mem->host_allocate(mvs_size), do_assign);
CHECK_RESULT(fb->seg_alloc, (uint8_t *)mem->host_allocate(seg_size), do_assign);
}
return 0;
}
// Computes the amount of host memory the decoder needs for the stream
// described by `cfg` and stores it in cfg->host_size.
//
// The same allocation sequence as av1_create_gpu_decoder is replayed against a
// dummy allocator with do_assign == 0, so nothing is stored and only the
// requested sizes are accumulated.
//
// Returns 0 on success, -1 if `cfg` is NULL.
extern "C" int av1_query_memory_requirements(aom_codec_dec_cfg_t *cfg) {
  if (!cfg) return -1;
  av1_memory_allocator_dummy mem;
  mem.host_allocate(32 * 1024);  // assume aom_codec_alg_priv size, actually ~20kb;
  mem.host_allocate(sizeof(av1_memory_allocator));
  mem.host_allocate(sizeof(Av1Core));
  resource_config rcfg;
  rcfg.bitdepth = cfg->bitdepth;
  rcfg.width = cfg->width;
  rcfg.height = cfg->height;
  rcfg.ref_fb_count = 12;
  rcfg.fb_count = rcfg.ref_fb_count;
  rcfg.gpu_pipeline_depth = FrameThreadDataCount;
  rcfg.enable_superres = 1;
  // Scratch decoder object: av1_allocate_buffers writes a few size fields into
  // it even when do_assign is 0.
  Av1Core dec;
  av1_allocate_buffers(&dec, &mem, rcfg, 0);
  cfg->host_size = mem.get_host_size();
  return 0;
}
// Ensures `context` has a D3D12 device and a direct command queue.
//
// If context->device is not set, the first non-software DXGI adapter on which
// a feature-level 11.0 D3D12 device can be created is used. If context->queue
// is not set, a direct command queue is created on the device. In both cases
// context->queue_direct is set to context->queue.
//
// Returns S_OK (or another success HRESULT) on success, a failure HRESULT
// otherwise.
int create_device(dx_compute_context *context) {
  HRESULT hr = S_OK;
  if (context->device == 0) {
    UINT dxgiFactoryFlags = 0;
#if defined(_DEBUG)
    {
      ComPtr<ID3D12Debug> debugController;
      hr = D3D12GetDebugInterface(IID_PPV_ARGS(&debugController));
      if (SUCCEEDED(hr)) {
        debugController->EnableDebugLayer();
        dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG;
      }
    }
#endif
    ComPtr<IDXGIFactory4> factory;
    hr = CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory));
    if (FAILED(hr)) return hr;
    ComPtr<IDXGIAdapter1> hardwareAdapter;
    hr = E_FAIL;
    // Stop on any enumeration failure, not only DXGI_ERROR_NOT_FOUND, so a
    // null adapter is never dereferenced.
    for (UINT index = 0; SUCCEEDED(factory->EnumAdapters1(index, &hardwareAdapter)); ++index) {
      DXGI_ADAPTER_DESC1 desc;
      if (FAILED(hardwareAdapter->GetDesc1(&desc))) continue;
      if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) continue;
      hr = D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&context->device));
      if (SUCCEEDED(hr)) break;
    }
    if (context->device == NULL || FAILED(hr)) return E_FAIL;
  }
  Microsoft::WRL::ComPtr<ID3D12Device> device = context->device;
  if (!context->queue) {
    D3D12_COMMAND_QUEUE_DESC desc = {};
    desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
    desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;  // enable_cpu_output == EnableHostOutput ?
                                                 // D3D12_COMMAND_LIST_TYPE_DIRECT : D3D12_COMMAND_LIST_TYPE_COMPUTE;
    hr = device->CreateCommandQueue(&desc, IID_PPV_ARGS(&context->queue));
    if (FAILED(hr)) return hr;
  }
  context->queue_direct = context->queue;
  return hr;
}
// Creates the GPU decoder core inside the caller-provided host memory block.
//
// gpu_dec  receives the decoder pointer on success; left untouched on failure.
// cfg      decoder configuration: host memory block and size, optional D3D12
//          device / command queue, output buffer callbacks, shader library and
//          stream dimensions.
//
// The allocator and the Av1Core object are both carved out of
// cfg->host_memory. Returns 0 on success, -1 on any failure (invalid
// arguments, device or resource creation failure, allocation failure).
// NOTE(review): objects created before a failing step are not torn down here.
extern "C" int av1_create_gpu_decoder(Av1Core **gpu_dec, aom_codec_dec_cfg_t *cfg) {
  if (!gpu_dec || !cfg || !cfg->host_memory) return -1;
  if (cfg->host_size < sizeof(av1_memory_allocator)) return -1;
  av1_memory_allocator *mem = new (cfg->host_memory) av1_memory_allocator;
  mem->setup((uint8_t *)cfg->host_memory, cfg->host_size);
  // Reserve the space occupied by the allocator object itself.
  mem->host_allocate(sizeof(*mem));
  Av1Core *dec = (Av1Core *)mem->host_allocate(sizeof(Av1Core));
  if (!dec) return -1;
  memset(dec, 0, sizeof(*dec));
  dec->memory = mem;
  dec->compute.device = static_cast<ID3D12Device *>(cfg->dx12device);
  dec->compute.queue = static_cast<ID3D12CommandQueue *>(cfg->dx12command_queue);
  if (FAILED(create_device(&dec->compute))) return -1;
  dx_compute_context *compute = &dec->compute;
  mem->set_dx_context(compute);
  // if (!cfg->out_buffers_cb.get_out_buffer_cb ||
  //    !cfg->out_buffers_cb.release_out_buffer_cb)
  //    return -1;
  dec->cb_get_output_image = cfg->out_buffers_cb.get_out_buffer_cb;
  dec->cb_release_image = cfg->out_buffers_cb.release_out_buffer_cb;
  dec->cb_notify_frame_ready = cfg->out_buffers_cb.notify_frame_ready_cb;
  dec->image_alloc_priv = cfg->out_buffers_cb.out_buffers_priv;
  dec->shader_lib = static_cast<compute_shader_lib *>(cfg->dxPsos);
  if (!dec->shader_lib) return -1;
  if (wait_shader_create_complete(dec->shader_lib)) return -1;
  resource_config rcfg;
  rcfg.bitdepth = cfg->bitdepth;
  rcfg.width = cfg->width;
  rcfg.height = cfg->height;
  rcfg.ref_fb_count = 12;
  rcfg.fb_count = rcfg.ref_fb_count;
  rcfg.gpu_pipeline_depth = FrameThreadDataCount;
  rcfg.enable_superres = 1;
  if (av1_allocate_buffers(dec, mem, rcfg, 1)) return -1;
  Microsoft::WRL::ComPtr<ID3D12Device> device = compute->device;
  dec->tryhdr10x3 = cfg->tryHDR10x3;
  HRESULT hr;
  // Per-frame command allocator, fence, completion event and mutex; every
  // entry starts out in the free pool.
  MTQueueInit(&dec->frame_data_pool);
  for (int i = 0; i < FrameThreadDataCount; ++i) {
    av1_frame_thread_data *td = &dec->frame_thread_data[i];
    ComputeCommandBuffer *cb = &td->command_buffer;
    hr = device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&cb->allocator));
    if (FAILED(hr)) return -1;
    cb->fence_value = 0;
    hr = device->CreateFence(cb->fence_value, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&cb->fence));
    if (FAILED(hr)) return -1;
    cb->event = CreateEvent(nullptr, false, false, nullptr);
    // Without a valid event av1_sync_gpu could not wait for GPU completion.
    if (!cb->event) return -1;
    td->frame_number = 0;
    pthread_mutex_init(&td->sec_data_mutex, NULL);
    MTQueuePush(&dec->frame_data_pool, td);
  }
  // A single command list is shared by all frames; it is created against the
  // first allocator and closed right away.
  hr = device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT,
                                 dec->frame_thread_data[0].command_buffer.allocator.Get(), NULL,
                                 IID_PPV_ARGS(&compute->command_list));
  if (FAILED(hr)) return -1;
  if (FAILED(compute->command_list->Close())) return -1;
  if (FAILED(av1_upload_luts(dec))) return -1;
  MTQueueInit(&dec->gpu_item_pool);
  MTQueueInit(&dec->gpu_waiting_queue);
  for (int i = 0; i < 8; ++i) {
    dec->gpu_item_pool_src[i].data = NULL;
    dec->gpu_item_pool_src[i].image = NULL;
    MTQueuePush(&dec->gpu_item_pool, &dec->gpu_item_pool_src[i]);
  }
  MTQueueInit(&dec->output_queue);
  MTQueueInit(&dec->image_pool);
  for (int i = 0; i < ImagePoolSize; ++i) {
    HwOutputImage *img = &dec->image_pool_src[i];
    img->size = 0;
    img->fb_ptr = NULL;
    img->is_valid = 0;
    // img->hw_buf = dec->output_frame_buffers[i];
    MTQueuePush(&dec->image_pool, img);
  }
  // back_buffer1 occupies the region in front of the pooled frame buffers.
  dec->back_buffer1.size = dec->fb_offset;
  dec->back_buffer1.base_offset = 0;
  QueueInit(&dec->fb_pool);
  for (int i = 0; i < rcfg.ref_fb_count; ++i) {
    const int offset = dec->fb_offset + dec->fb_size * i;
    HwFrameBuffer *fb = &dec->fb_pool_src[i];
    // fb->pool_ptr = pool;
    // fb->fb_ptr = pool + offset;
    fb->size = dec->fb_size;
    fb->base_offset = offset;
    fb->ref_cnt = 0;
    QueuePush(&dec->fb_pool, fb);
  }
  pthread_cond_init(&dec->fb_pool_empty_cond, NULL);
  pthread_mutex_init(&dec->fb_pool_mutex, NULL);
  *gpu_dec = dec;
  return 0;
}
// Hands out the AV1Decoder and BufferPool storage reserved by
// av1_allocate_buffers. Both objects are zero-filled, the decoder is linked
// back to `dec`, and the pointers are returned through `ppbi` and `pbp`.
extern "C" void av1_allocate_pbi(Av1Core *dec, AV1Decoder **ppbi, BufferPool **pbp) {
  AV1Decoder *const decoder = (AV1Decoder *)dec->pbi_alloc;
  BufferPool *const pool = (BufferPool *)dec->buf_pool_alloc;

  memset(decoder, 0, sizeof(*decoder));
  memset(pool, 0, sizeof(*pool));
  decoder->gpu_decoder = dec;

  *ppbi = decoder;
  *pbp = pool;
}
// Takes a frame buffer from the free pool, blocking on fb_pool_empty_cond
// while the pool is empty. The returned buffer has its reference count set
// to 1.
HwFrameBuffer *get_frame_buffer(Av1Core *dec) {
  pthread_mutex_lock(&dec->fb_pool_mutex);
  for (;;) {
    if (dec->fb_pool.m_QueueNotEmpty) break;
    pthread_cond_wait(&dec->fb_pool_empty_cond, &dec->fb_pool_mutex);
  }
  HwFrameBuffer *const buffer = (HwFrameBuffer *)QueueGet(&dec->fb_pool);
  buffer->ref_cnt = 1;
  pthread_mutex_unlock(&dec->fb_pool_mutex);
  return buffer;
}
// Adds one reference to `fb` while holding the frame buffer pool mutex.
void frame_buffer_acquire(Av1Core *dec, HwFrameBuffer *fb) {
  pthread_mutex_t *const pool_lock = &dec->fb_pool_mutex;
  pthread_mutex_lock(pool_lock);
  fb->ref_cnt += 1;
  pthread_mutex_unlock(pool_lock);
}
// Drops one reference to `fb` under the pool mutex. When the last reference
// goes away the buffer is put back into the free pool and one waiter on
// fb_pool_empty_cond is woken.
void frame_buffer_release(Av1Core *dec, HwFrameBuffer *fb) {
  pthread_mutex_lock(&dec->fb_pool_mutex);
  const bool last_reference = (--fb->ref_cnt == 0);
  if (last_reference) {
    QueuePush(&dec->fb_pool, fb);
    pthread_cond_signal(&dec->fb_pool_empty_cond);
  }
  pthread_mutex_unlock(&dec->fb_pool_mutex);
}
// Tears the decoder down: stops the GPU worker thread, releases output images
// that are still marked valid through the release callback (if one is set),
// closes the per-frame events, destroys the synchronisation objects and
// queues, releases the memory manager and finally runs the Av1Core
// destructor. A NULL `dec` is ignored.
void av1_destroy_gpu_decoder(Av1Core *dec) {
  if (dec == NULL) return;

  av1_drain_gpu_decoder(dec);

  if (dec->cb_release_image) {
    for (int idx = 0; idx < ImagePoolSize; ++idx) {
      HwOutputImage *const image = &dec->image_pool_src[idx];
      if (!image->is_valid) continue;
      dec->cb_release_image(dec->image_alloc_priv, image->alloc_priv);
    }
  }

  for (int idx = 0; idx < FrameThreadDataCount; ++idx) {
    av1_frame_thread_data *const frame = &dec->frame_thread_data[idx];
    CloseHandle(frame->command_buffer.event);
    pthread_mutex_destroy(&frame->sec_data_mutex);
  }

  pthread_cond_destroy(&dec->fb_pool_empty_cond);
  pthread_mutex_destroy(&dec->fb_pool_mutex);

  MTQueueDestroy(&dec->output_queue);
  MTQueueDestroy(&dec->image_pool);
  MTQueueDestroy(&dec->gpu_waiting_queue);
  MTQueueDestroy(&dec->gpu_item_pool);
  MTQueueDestroy(&dec->frame_data_pool);

  if (dec->memory) {
    dec->memory->release();
  }
  dec->~Av1Core();
}
// Starts recording for the current frame: resets the frame's command
// allocator, reopens the shared command list on it, resets the frame's
// command buffer state and drops the first perf marker.
void av1_prepare_command_buffer(Av1Core *dec) {
  av1_frame_thread_data *const frame = dec->curr_frame_data;
  ComputeCommandBuffer &cmd = frame->command_buffer;

  cmd.allocator->Reset();
  dec->compute.command_list->Reset(cmd.allocator.Get(), NULL);
  cmd.Reset();
  PutPerfMarker(frame, &frame->perf_markers[0]);
}
// Submits the commands recorded for the current frame: closes the shared
// command list, executes it on the queue, signals the frame's fence with the
// next fence value and arms the frame's event to fire when the fence reaches
// that value.
void av1_commit_command_buffer(Av1Core *dec) {
  av1_frame_thread_data *const frame = dec->curr_frame_data;
  PutPerfMarker(frame, &frame->perf_markers[15]);

  dx_compute_context *const ctx = &dec->compute;
  ComputeCommandBuffer *const cmd = &frame->command_buffer;

  ctx->command_list->Close();
  ID3D12CommandList *submit[] = { ctx->command_list.Get() };
  cmd->fence_value += 1;
  ctx->queue->ExecuteCommandLists(1, submit);
  ctx->queue->Signal(cmd->fence.Get(), cmd->fence_value);
  cmd->fence->SetEventOnCompletion(cmd->fence_value, cmd->event);
}
// Performance marker hook; this implementation intentionally does nothing.
void PutPerfMarker(av1_frame_thread_data *td, volatile int64_t *marker) {
  (void)td;
  (void)marker;
}
// Points the common decoder state at the context, restoration-info and
// temporal-MV storage that av1_allocate_buffers reserved in the GPU decoder
// core, and records the allocated dimensions (tile rows, and mi columns
// rounded up to a multiple of 32).
extern "C" void av1_setup_context_buffers(AV1Decoder *pbi) {
  Av1Core *const core = pbi->gpu_decoder;
  AV1_COMMON *const common = &pbi->common;

  // above context:
  common->num_allocated_above_contexts = common->tile_rows;
  common->num_allocated_above_context_mi_col = (common->mi_cols + 31) & ~31;
  if (common->seq_params.monochrome) {
    common->num_allocated_above_context_planes = 1;
  } else {
    common->num_allocated_above_context_planes = MAX_MB_PLANE;
  }

  // Slots 0..2 hold the per-plane entropy contexts, slot 3 the partition
  // context and slot 4 the transform context.
  for (int plane = 0; plane < 3; ++plane) {
    common->above_context[plane] = (ENTROPY_CONTEXT **)core->above_context_alloc[plane];
    common->rst_info[plane].unit_info = (RestorationUnitInfo *)core->restoration_info_alloc[plane];
  }
  common->above_seg_context = (PARTITION_CONTEXT **)core->above_context_alloc[3];
  common->above_txfm_context = (TXFM_CONTEXT **)core->above_context_alloc[4];

  common->tpl_mvs = (TPL_MV_REF *)core->tplmvs_alloc;
}
// Frame display hook; this implementation intentionally does nothing.
extern "C" void av1_show_frame(AV1Decoder *pbi, YV12_BUFFER_CONFIG *buf, int is_visible) {
  (void)pbi;
  (void)buf;
  (void)is_visible;
}
// Describes the finished output image `hw_img` in the public aom_image_t
// `dst` (dimensions, 4:2:0 chroma shifts, plane pointers, strides, private
// pointers) and then clears the buffer references held by `hw_img` so it can
// be reused. For monochrome images the chroma plane pointers are NULL.
void copy_to_img(Av1Core *dec, HwOutputImage *hw_img, aom_image_t *dst) {
  const auto crop_w = hw_img->y_crop_width;
  const auto crop_h = hw_img->y_crop_height;

  dst->bit_depth = hw_img->hbd ? 10 : 8;
  dst->w = crop_w;
  dst->d_w = crop_w;
  dst->r_w = crop_w;
  dst->h = crop_h;
  dst->d_h = crop_h;
  dst->r_h = crop_h;
  dst->x_chroma_shift = 1;
  dst->y_chroma_shift = 1;

  dst->planes[AOM_PLANE_Y] = hw_img->fb_ptr + hw_img->planes[0].offset;
  if (hw_img->monochrome) {
    dst->planes[AOM_PLANE_U] = NULL;
    dst->planes[AOM_PLANE_V] = NULL;
  } else {
    dst->planes[AOM_PLANE_U] = hw_img->fb_ptr + hw_img->planes[1].offset;
    dst->planes[AOM_PLANE_V] = hw_img->fb_ptr + hw_img->planes[2].offset;
  }
  dst->stride[AOM_PLANE_Y] = hw_img->planes[0].stride;
  dst->stride[AOM_PLANE_U] = hw_img->planes[1].stride;
  dst->stride[AOM_PLANE_V] = hw_img->planes[2].stride;

  dst->user_priv = hw_img->user_priv;
  dst->fb2_priv = hw_img->alloc_priv;
  dst->monochrome = hw_img->monochrome;
  dst->is_hdr10x3 = dec->tryhdr10x3;

  // Ownership of the buffer has moved to `dst`; detach it from the pool entry.
  hw_img->fb_ptr = NULL;
  hw_img->alloc_priv = NULL;
  hw_img->user_priv = NULL;
  hw_img->is_valid = 0;
}
// Fetches the next finished image from the output queue into `dst`.
// Returns 0 when an image was delivered and 1 when the queue is empty. The
// pool entry that carried the image is returned to the image pool.
extern "C" int get_output_frame(AV1Decoder *pbi, aom_image_t *dst) {
  Av1Core *const core = pbi->gpu_decoder;
  if (!MTQueueIsEmpty(&core->output_queue)) {
    HwOutputImage *const ready = (HwOutputImage *)MTQueueGet(&core->output_queue);
    copy_to_img(core, ready, dst);
    MTQueuePush(&core->image_pool, ready);
    return 0;
  }
  return 1;
}
// Blocks until the completion event of the frame's command buffer is set.
void av1_sync_gpu(av1_frame_thread_data *td) {
  WaitForSingleObject(td->command_buffer.event, INFINITE);
}
// Drops one reference to `fb` without taking the pool mutex or signalling;
// the callers in this file hold fb_pool_mutex around it. Returns 1 when the
// buffer was returned to the free pool, 0 while it is still referenced.
int release_fb(Av1Core *dec, HwFrameBuffer *fb) {
  fb->ref_cnt -= 1;
  if (fb->ref_cnt != 0) return 0;
  QueuePush(&dec->fb_pool, fb);
  return 1;
}
// Worker thread that retires submitted GPU work.
//
// Each item taken from gpu_waiting_queue may carry a frame (item->data) and/or
// an output image (item->image). For a frame the thread waits for its GPU
// completion event, drops the frame buffer references held by the frame and
// returns the frame data to frame_data_pool. An image is either reported
// through the frame-ready callback or queued on output_queue. A NULL item
// (pushed by av1_drain_gpu_decoder) terminates the thread.
THREADFN gpu_thread_hook(void *data) {
Av1Core *dec = (Av1Core *)data;
while (1) {
GpuWorkItem *item = (GpuWorkItem *)MTQueueGet(&dec->gpu_waiting_queue);
if (!item) {
// Shutdown request: notify the client with a null image and exit.
if (dec->cb_notify_frame_ready && dec->image_alloc_priv) dec->cb_notify_frame_ready(dec->image_alloc_priv, 0);
break;
}
av1_frame_thread_data *td = item->data;
if (td) {
// sync gpu
av1_sync_gpu(td);
// Release every frame buffer this frame referenced; remember whether any
// of them went back to the free pool so a waiter can be woken.
pthread_mutex_lock(&dec->fb_pool_mutex);
int signal = 0;
if (td->back_buffer0) signal |= release_fb(dec, td->back_buffer0);
signal |= release_fb(dec, td->dst_frame_buffer);
for (int r = 0; r < 7; ++r) {
if (td->refs[r]) signal |= release_fb(dec, td->refs[r]);
td->refs[r] = 0;
}
td->dst_frame_buffer = 0;
td->back_buffer0 = 0;
if (signal) {
pthread_cond_signal(&dec->fb_pool_empty_cond);
}
// Return the secondary frame data (if one was attached) and then the frame
// data itself to the free pool.
if (td->sec_thread_data) {
MTQueuePush(&dec->frame_data_pool, td->sec_thread_data);
}
pthread_mutex_unlock(&dec->fb_pool_mutex);
td->sec_thread_data = NULL;
MTQueuePush(&dec->frame_data_pool, td);
}
HwOutputImage *hw_img = item->image;
if (hw_img) {
if (dec->cb_notify_frame_ready && dec->image_alloc_priv) {
// Push mode: describe the image in a stack aom_image_t, recycle the pool
// entry and hand the description to the client callback.
aom_image_t img = {};
aom_image_t *dst = &img;
copy_to_img(dec, hw_img, dst);
MTQueuePush(&dec->image_pool, hw_img);
dec->cb_notify_frame_ready(dec->image_alloc_priv, dst);
} else {
// Pull mode: the client collects the image via get_output_frame.
MTQueuePush(&dec->output_queue, hw_img);
}
}
MTQueuePush(&dec->gpu_item_pool, item);
}
return THREAD_RETURN(0);
}
extern "C" int av1_reallocate_frame_buffer(void *priv, YV12_BUFFER_CONFIG *ybf, int width, int height,
int upscaled_width, int hbd) {
if (!ybf || !priv) return -1;
Av1Core *dec = (Av1Core *)priv;
ybf->hw_show_image = NULL;
if (!ybf->hw_buffer) {
ybf->hw_buffer = get_frame_buffer(dec);
}
if (!ybf->hw_buffer) return -1;
const int bpp = hbd + 1;
const int h_border = 16;
const int aligned_width = (width + 127) & ~127;
const int aligned_height = (height + 127) & ~127;
const int y_stride = aligned_width + 2 * h_border;
const int y_height = aligned_height;
const int y_size = y_height * y_stride;
const int uv_stride = (aligned_width >> 1) + 2 * h_border;
const int uv_size = (y_height >> 1) * uv_stride;
const int fb_size = (y_size + 2 * uv_size) * (1 + hbd);
const int superres_w = (upscaled_width + 127) & ~127;
const int superres_size =
(y_height * (superres_w + 2 * h_border) + 2 * (y_height >> 1) * ((superres_w >> 1) + 2 * h_border)) * (1 + hbd);
if (ybf->hw_buffer->size < fb_size || ybf->hw_buffer->size < superres_size ||
(width != upscaled_width && dec->enable_superres == 0)) {
frame_buffer_release(dec, ybf->hw_buffer);
ybf->hw_buffer = NULL;
return -1;
}
ybf->buffer_alloc = NULL;
ybf->buffer_alloc_sz = 0;
ybf->y_crop_width = width;
ybf->y_crop_height = height;
ybf->y_width = (width + 7) & ~7;
ybf->y_height = (height + 7) & ~7;
ybf->y_stride = y_stride;
ybf->uv_crop_width = (width + 1) >> 1;
ybf->uv_crop_height = (height + 1) >> 1;
ybf->uv_width = ybf->y_width >> 1;
ybf->uv_height = ybf->y_height >> 1;
ybf->uv_stride = uv_stride;
ybf->border = h_border;
ybf->frame_size = (size_t)fb_size;
ybf->subsampling_x = 1;
ybf->subsampling_y = 1;
ybf->use_external_reference_buffers = 0;
ybf->flags = hbd ? YV12_FLAG_HIGHBITDEPTH : 0;
ybf->y_buffer = NULL;
ybf->u_buffer = NULL;
ybf->v_buffer = NULL;
HwFrameBuffer *buf = ybf->hw_buffer;
buf->width = ybf->y_width;
buf->height = ybf->y_height;
buf->hbd = hbd;
buf->y_crop_width = ybf->y_crop_width;
buf->y_crop_height = ybf->y_crop_height;
buf->uv_crop_width = ybf->uv_crop_width;
buf->uv_crop_height = ybf->uv_crop_height;
buf->planes[0].stride = ybf->y_stride * bpp;
buf->planes[1].stride = ybf->uv_stride * bpp;
buf->planes[2].stride = ybf->uv_stride * bpp;
buf->planes[0].offset = static_cast<int>(buf->base_offset + h_border * bpp);
buf->planes[1].offset = static_cast<int>(buf->base_offset + (y_size + h_border) * bpp);
buf->planes[2].offset = static_cast<int>(buf->planes[1].offset + uv_size * bpp);
buf->planes[0].res_stride = sizeof(short) * ((ybf->y_width + 127) & (~127));
buf->planes[1].res_stride = sizeof(short) * ((ybf->uv_width + 127) & (~127));
buf->planes[2].res_stride = sizeof(short) * ((ybf->uv_width + 127) & (~127));
buf->planes[0].res_offset = 0;
buf->planes[1].res_offset = buf->planes[0].res_stride * ((ybf->y_height + 127) & (~127));
buf->planes[2].res_offset = buf->planes[1].res_offset + buf->planes[1].res_stride * ((ybf->uv_height + 127) & (~127));
return 0;
}
// Releases the resources attached to `ybf`: its hardware frame buffer
// reference and, if present, the output image it was shown through.
//
// priv  the Av1Core instance.
// ybf   buffer descriptor being released.
//
// NULL arguments are ignored. The client release callback is optional (the
// decoder can be created without one), so it is only invoked when set; the
// output image entry is recycled into the image pool either way.
void av1_release_fb_callback(void *priv, YV12_BUFFER_CONFIG *ybf) {
  if (!priv || !ybf) return;
  Av1Core *dec = (Av1Core *)priv;
  if (ybf->hw_buffer) {
    frame_buffer_release(dec, ybf->hw_buffer);
    ybf->hw_buffer = NULL;
  }
  HwOutputImage *img = ybf->hw_show_image;
  ybf->hw_show_image = NULL;
  if (!img) return;
  if (dec->cb_release_image) {
    dec->cb_release_image(dec->image_alloc_priv, img->alloc_priv);
  }
  img->fb_ptr = NULL;
  img->size = 0;
  img->alloc_priv = NULL;
  img->is_valid = 0;
  MTQueuePush(&dec->image_pool, img);
}
// Stops the GPU worker thread, if one is running: a NULL work item is queued
// as the shutdown request, the thread is joined and its handle is cleared.
// Does nothing when `dec` is NULL or no thread was started.
void av1_drain_gpu_decoder(Av1Core *dec) {
  if (!dec) return;
  if (!dec->gpu_thread) return;
  MTQueuePush(&dec->gpu_waiting_queue, NULL);
  pthread_join(dec->gpu_thread, 0);
  dec->gpu_thread = NULL;
}
// Makes sure `td` has a secondary frame data object attached. The first
// caller takes one from frame_data_pool and clears the mi_count of its first
// td->tile_count tiles; later callers find it already set. Guarded by
// td->sec_data_mutex.
void get_sec_data(Av1Core *dec, av1_frame_thread_data *td) {
  pthread_mutex_lock(&td->sec_data_mutex);
  if (!td->sec_thread_data) {
    av1_frame_thread_data *const secondary = (av1_frame_thread_data *)MTQueueGet(&dec->frame_data_pool);
    td->sec_thread_data = secondary;
    int tile = 0;
    while (tile < td->tile_count) {
      secondary->tile_data[tile].mi_count = 0;
      ++tile;
    }
  }
  pthread_mutex_unlock(&td->sec_data_mutex);
}
// Prepares per-frame GPU decoder state before a frame is decoded.
//
// pbi  decoder instance owning the GPU decoder core.
// cm   common state of the frame about to be decoded; cm->cur_frame must
//      already have a hardware buffer attached.
//
// Steps:
//  - if frame data from a previous call is still current, its frame buffer
//    references are dropped and it is returned to the free pool;
//  - a fresh frame data object is taken from the pool and bound to the mode
//    info grid and the intra/inter iteration grids;
//  - the post-processing stages (superres, loop restoration, CDEF, film
//    grain) are selected, and an intermediate back buffer is acquired when
//    any of the first three is active;
//  - references are taken on the destination buffer and on every available
//    reference frame buffer;
//  - the extended IDCT buffer is enabled for frames containing very small
//    tiles, the GPU worker thread is started on first use, and the reference
//    scale factors are captured.
extern "C" void av1_setup_frame(AV1Decoder *pbi, AV1_COMMON *cm) {
  Av1Core *dec = pbi->gpu_decoder;
  av1_frame_thread_data *td = dec->curr_frame_data;
  if (td) {
    // Frame data left over from a previous call: give back everything it holds.
    pthread_mutex_lock(&dec->fb_pool_mutex);
    int signal = 0;
    if (td->back_buffer0) signal |= release_fb(dec, td->back_buffer0);
    if (td->dst_frame_buffer) signal |= release_fb(dec, td->dst_frame_buffer);
    for (int r = 0; r < 7; ++r) {
      if (td->refs[r]) signal |= release_fb(dec, td->refs[r]);
      td->refs[r] = 0;
    }
    td->dst_frame_buffer = 0;
    td->back_buffer0 = 0;
    // Wake a thread blocked in get_frame_buffer if a buffer became free, the
    // same way gpu_thread_hook does after releasing buffers.
    if (signal) {
      pthread_cond_signal(&dec->fb_pool_empty_cond);
    }
    pthread_mutex_unlock(&dec->fb_pool_mutex);
    if (td->sec_thread_data) {
      MTQueuePush(&dec->frame_data_pool, td->sec_thread_data);
    }
    MTQueuePush(&dec->frame_data_pool, td);
    dec->curr_frame_data = NULL;
  }
  td = (av1_frame_thread_data *)MTQueueGet(&dec->frame_data_pool);
  td->sec_thread_data = NULL;
  dec->curr_frame_data = td;
  td->frame_number = dec->frame_number;
  YV12_BUFFER_CONFIG *buf = &cm->cur_frame->buf;
  buf->hw_show_image = NULL;
  cm->mi_grid_base = (MB_MODE_INFO **)td->mode_info_grid->host_ptr;
  cm->mi_grid_visible = cm->mi_grid_base;
  cm->mi_alloc_size = cm->mi_stride * ((cm->mi_rows + 31) & ~31);
  // The grid is an array of MB_MODE_INFO pointers; size it by its element type.
  memset(cm->mi_grid_base, 0, sizeof(*cm->mi_grid_base) * cm->mi_alloc_size);
  const int grid_w = (((buf->y_width + 63) & (~63)) >> 2) + 2 + 128;
  const int grid_h = (((buf->y_height + 63) & (~63)) >> 2) + 2 + 128;
  td->iter_grid_stride = grid_w;
  td->iter_grid_stride_uv = grid_w >> 1;
  td->bitdepth = buf->bit_depth;
  td->tile_count = cm->tile_cols * cm->tile_rows;
  dec->thread_count = AOMMIN(pbi->max_threads, td->tile_count);
  // The luma grid holds grid_w * grid_h ints; the chroma grid follows it and
  // is cleared over grid_w * grid_h bytes (a quarter as many ints).
  td->gen_intra_iter_y = (int *)td->gen_intra_inter_grid->host_ptr;
  td->iter_grid_offset_uv = grid_w * grid_h;
  td->gen_intra_iter_uv = td->gen_intra_iter_y + td->iter_grid_offset_uv;
  memset(td->gen_intra_iter_y, -1, td->iter_grid_offset_uv * 4);
  memset(td->gen_intra_iter_uv, -1, td->iter_grid_offset_uv);
  td->is_hbd = buf->bit_depth > 8;
  td->shaders = td->is_hbd ? &dec->shader_lib->shaders_hbd : &dec->shader_lib->shaders_8bit;
  HwFrameBuffer *fb = buf->hw_buffer;
  assert(fb);
  fb->frame_number = dec->frame_number;
  fb->hbd = td->is_hbd;
  ++dec->frame_number;
  // The leading "1 &&" terms are switches that allow a stage to be forced off
  // by editing the constant.
  td->do_superres = 1 && dec->enable_superres && cm->width != cm->superres_upscaled_width;
  td->do_loop_rest = 1 && (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
                           cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
                           cm->rst_info[2].frame_restoration_type != RESTORE_NONE);
  td->do_cdef = 1 && !cm->skip_loop_filter && !cm->coded_lossless &&
                (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] || cm->cdef_info.cdef_uv_strengths[0]);
  td->do_filmgrain = 1 && cm->film_grain_params.apply_grain;
  td->dst_frame_buffer = fb;
  td->back_buffer0 = NULL;
  if (td->do_loop_rest || td->do_cdef || td->do_superres) {
    // Post-processing is active: decode into an intermediate buffer that
    // mirrors the destination layout, rebased to its own pool offset.
    HwFrameBuffer *new_buf = get_frame_buffer(dec);
    td->back_buffer0 = new_buf;
    new_buf->width = fb->width;
    new_buf->height = fb->height;
    new_buf->hbd = td->is_hbd;
    new_buf->y_crop_width = fb->y_crop_width;
    new_buf->uv_crop_width = fb->uv_crop_width;
    new_buf->y_crop_height = fb->y_crop_height;
    new_buf->uv_crop_height = fb->uv_crop_height;
    memcpy(new_buf->planes, fb->planes, sizeof(fb->planes));
    new_buf->planes[0].offset += static_cast<int>(new_buf->base_offset - fb->base_offset);
    new_buf->planes[1].offset += static_cast<int>(new_buf->base_offset - fb->base_offset);
    new_buf->planes[2].offset += static_cast<int>(new_buf->base_offset - fb->base_offset);
    td->frame_buffer = td->back_buffer0;
    dec->back_buffer1.width = fb->width;
    dec->back_buffer1.height = fb->height;
    memcpy(dec->back_buffer1.planes, fb->planes, sizeof(fb->planes));
    dec->back_buffer1.planes[0].offset += static_cast<int>(dec->back_buffer1.base_offset - fb->base_offset);
    dec->back_buffer1.planes[1].offset += static_cast<int>(dec->back_buffer1.base_offset - fb->base_offset);
    dec->back_buffer1.planes[2].offset += static_cast<int>(dec->back_buffer1.base_offset - fb->base_offset);
  } else {
    td->frame_buffer = td->dst_frame_buffer;
  }
  // Pin the destination and every usable reference for the lifetime of this
  // frame's GPU work; gpu_thread_hook drops these references afterwards.
  pthread_mutex_lock(&dec->fb_pool_mutex);
  ++fb->ref_cnt;
  for (int i = 0; i < 7; ++i) {
    td->refs[i] = NULL;
    const int map_idx = cm->remapped_ref_idx[i];
    if (map_idx == -1) continue;
    // Check the map entry itself: taking the address of its `buf` member
    // first would never yield NULL, so a missing reference must be detected
    // here.
    const auto ref_entry = cm->ref_frame_map[map_idx];
    if (!ref_entry) continue;
    HwFrameBuffer *const ref_fb = ref_entry->buf.hw_buffer;
    if (!ref_fb) continue;
    assert(ref_fb->size == dec->fb_size);
    td->refs[i] = ref_fb;
    ++ref_fb->ref_cnt;
  }
  pthread_mutex_unlock(&dec->fb_pool_mutex);
  td->ext_idct_buffer = 0;
  if (IdctCoefCountNum != IdctCoefCountDenum) {
    // Any tile of at most TileSbSizeThreshold superblocks switches the frame
    // to the extended coefficient buffer, which needs secondary frame data.
    for (int r = 0; r < cm->tile_rows; ++r)
      for (int c = 0; c < cm->tile_cols; ++c) {
        td->ext_idct_buffer |= ((cm->tile_col_start_sb[c + 1] - cm->tile_col_start_sb[c]) *
                                (cm->tile_row_start_sb[r + 1] - cm->tile_row_start_sb[r])) <= TileSbSizeThreshold;
      }
    if (td->ext_idct_buffer) {
      get_sec_data(dec, td);
    }
  }
  assert(dec->thread_count <= EntropyThreadCount);
  // The GPU worker thread is started lazily on the first frame.
  if (!dec->gpu_thread) pthread_create(&dec->gpu_thread, 0, (LPTHREAD_START_ROUTINE)gpu_thread_hook, dec);
  td->scale_enable = 0;
  for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
    scale_factors *sf = get_ref_scale_factors(cm, i);
    if (!sf) {
      td->scale_factors[i].x_scale = REF_NO_SCALE;
      td->scale_factors[i].y_scale = REF_NO_SCALE;
      td->scale_factors[i].x_step = 0;
      td->scale_factors[i].y_step = 0;
    } else {
      td->scale_factors[i].x_step = sf->x_step_q4;
      td->scale_factors[i].y_step = sf->y_step_q4;
      td->scale_factors[i].x_scale = sf->x_scale_fp;
      td->scale_factors[i].y_scale = sf->y_scale_fp;
      td->scale_enable |= av1_is_scaled(sf);
    }
  }
}
// Redirects the tile's coefficient write position into the coefficient
// region of the current frame's secondary frame data (attached on demand),
// keeping the tile's own offset within that region.
extern "C" void av1_setup_ext_coef_buffer(AV1Decoder *pbi, AV1_COMMON *cm, ThreadData *thread_data) {
  Av1Core *const core = pbi->gpu_decoder;
  av1_frame_thread_data *const primary = core->curr_frame_data;
  get_sec_data(core, primary);
  av1_frame_thread_data *const secondary = primary->sec_thread_data;

  av1_tile_data *const tile = thread_data->tile_data;
  tile->dq_buffer_ptr = secondary->coef_buffer_offset + tile->dq_buffer_offset;
}
// Sets up the secondary tile data for the calling thread's tile. The
// secondary frame data is attached on demand; its tile entry starts empty
// and its mode info offset is the primary tile's offset moved from the
// primary frame's slice of the mode info pool to the secondary frame's slice.
// The thread's secondary counters and pointers are updated to match.
extern "C" void av1_setup_sec_data(AV1Decoder *pbi, AV1_COMMON *cm, ThreadData *thread_data) {
  Av1Core *const core = pbi->gpu_decoder;
  av1_frame_thread_data *const primary = core->curr_frame_data;
  get_sec_data(core, primary);
  av1_frame_thread_data *const secondary = primary->sec_thread_data;

  av1_tile_data *const primary_tile = thread_data->tile_data;
  av1_tile_data *const secondary_tile = &secondary->tile_data[thread_data->tile_id];
  secondary_tile->mi_count = 0;
  secondary_tile->mi_offset = primary_tile->mi_offset + secondary->mode_info_offset - primary->mode_info_offset;

  thread_data->mi_count2 = 0;
  thread_data->mi_pool2 = ((MB_MODE_INFO *)core->mode_info_pool->host_ptr) + secondary_tile->mi_offset;
  thread_data->tile_data2 = secondary_tile;
}
/*
 * Resets and initialises the per-tile GPU bookkeeping (av1_tile_data) for
 * `tile`, and wires it into the worker's ThreadData and MACROBLOCKD.
 *
 * pbi          decoder instance; supplies the GPU core via gpu_decoder.
 * cm           frame-level state (tile_cols, mi_cols, mi_stride, mib_size).
 * thread_data  worker context receiving tile pointers, pools and counters.
 * tile         tile geometry in mode-info units.
 *
 * The tile slot is zeroed first; every offset below is then derived from the
 * tile's position and its extents rounded up to a multiple of 16 MI units.
 */
extern "C" void av1_setup_macroblockd(AV1Decoder *pbi, AV1_COMMON *cm, ThreadData *thread_data, TileInfo *tile) {
  Av1Core *dec = pbi->gpu_decoder;
  av1_frame_thread_data *td = dec->curr_frame_data;
  MACROBLOCKD *xd = &thread_data->xd;

  // Tiles are stored in raster order: one slot per (row, col).
  const int tile_id = tile->tile_col + tile->tile_row * cm->tile_cols;
  av1_tile_data *t = &td->tile_data[tile_id];
  memset(t, 0, sizeof(*t));
  thread_data->tile_data = t;
  xd->tile_data = t;

  // Tile extents in MI units, exact and rounded up to a multiple of 16.
  const int tile_mi_rows = tile->mi_row_end - tile->mi_row_start;
  const int tile_mi_cols = tile->mi_col_end - tile->mi_col_start;
  const int padded_rows = (tile_mi_rows + 15) & ~15;
  const int padded_cols = (tile_mi_cols + 15) & ~15;
  const int padded_frame_cols = (cm->mi_cols + 15) & ~15;

  // MI units preceding this tile: full padded frame rows above it plus the
  // columns to its left within the tile's padded row band.
  const int mi_offset = tile->mi_row_start * padded_frame_cols + tile->mi_col_start * padded_rows;
  const int mi_max = padded_rows * padded_cols;

  // Dequantized-coefficient window for this tile.
  t->dq_buffer_offset = mi_offset * (td->ext_idct_buffer ? IdctCoefCountDenum : IdctCoefCountNum);
  t->dq_buffer_ptr = td->coef_buffer_offset + t->dq_buffer_offset;
  t->dq_buffer_base = (tran_low_t *)dec->idct_coefs->host_ptr;
  const int mib_sz = cm->seq_params.mib_size;
  const int sb_sz = mib_sz * mib_sz * IdctCoefCountDenum;
  // The upper bound is pulled back by one superblock's worth (sb_sz) of
  // coefficients.
  t->dq_buffer_max = t->dq_buffer_ptr + mi_max * IdctCoefCountNum - sb_sz;
  assert(td->ext_idct_buffer || t->dq_buffer_max > t->dq_buffer_ptr);

  // Transform-block records: 1.5 entries per preceding MI unit, rounded up.
  const int blocks_4x4_count = (mi_offset * 3 + 1) >> 1;
  t->intra_iter_max = -1;
  t->intra_iter_max_uv = -1;
  t->blocks_offset = blocks_4x4_count;
  t->idct_blocks_host = (tx_block_info_gpu *)td->idct_blocks_unordered->host_ptr + blocks_4x4_count;

  // Mode-info pool slice, computed from the unpadded geometry and halved.
  const int mi_offset_base = tile->mi_row_start * cm->mi_stride + tile->mi_col_start * tile_mi_rows;
  const int mi_max_base = tile_mi_rows * tile_mi_cols;
  t->mi_offset = (mi_offset_base >> 1) + td->mode_info_offset;
  t->mi_count = 0;
  thread_data->tile_id = tile_id;
  thread_data->mi_count = 0;
  thread_data->mi_count_max = mi_max_base >> 1;
  thread_data->mi_pool = ((MB_MODE_INFO *)dec->mode_info_pool->host_ptr) + t->mi_offset;

  // The secondary pool starts detached.
  thread_data->mi_count2 = 0;
  thread_data->mi_pool2 = NULL;
  thread_data->tile_data2 = NULL;
  thread_data->ext_idct_buffer = td->ext_idct_buffer;

  t->mi_col_start = tile->mi_col_start;
  t->mi_row_start = tile->mi_row_start;

  // Prediction-block generation indexes and maps.
  t->gen_index_ptr = 0;
  t->gen_index_base = blocks_4x4_count * 2;
  t->gen_indexes = ((unsigned int *)td->gen_mi_block_indexes->host_ptr) + t->gen_index_base;
  // Each tile reserves 256 + 32 ints plus 10 ints per 4 preceding MI units;
  // the first 32 ints of the region precede the block map proper.
  t->gen_block_warp_offset = tile_id * (256 + 32) + (mi_offset >> 2) * 10;
  t->gen_block_map_offset = t->gen_block_warp_offset + 32;
  t->gen_block_map = ((int *)td->gen_block_map->host_ptr) + t->gen_block_map_offset;
  t->gen_block_map_wrp = ((int *)td->gen_block_map->host_ptr) + t->gen_block_warp_offset;
  t->gen_intra_iter_y = -1;
  t->gen_intra_iter_uv = -1;
  t->have_inter = 0;

  // Single-tile frames size the limit from pred_map_size; otherwise it comes
  // from the padded tile area.
  t->gen_intra_max_iter = td->tile_count == 1 ? (dec->pred_map_size - 512) / 10
                                              : ((padded_cols * padded_rows) >> 2) - 2;
  t->gen_intra_iter_set = AOMMIN(t->gen_intra_max_iter - 16, 256);
  t->gen_iter_clear_offset = (t->gen_intra_iter_set + 16) * 10 + 256;
  t->gen_iter_clear_size = sizeof(int) * (256 + t->gen_intra_max_iter * 10 - t->gen_iter_clear_offset);

  // Zero the head of the tile's map region now; gen_iter_clear_size records
  // the byte size of the remainder.
  memset(t->gen_block_map_wrp, 0, sizeof(int) * (32 + 16 + t->gen_iter_clear_offset));
}
extern "C" void av1_intra_palette(AV1Decoder *pbi, MB_MODE_INFO *mi, Av1ColorMapParam *params, int plane) {
Av1Core *dec = pbi->gpu_decoder;
av1_frame_thread_data *td = dec->curr_frame_data;
int i;
const HwFrameBuffer *fb = td->frame_buffer;
const int bsz = 4 >> plane;
uint8_t *map = params->color_map;
uint8_t *palette_buf = (uint8_t *)td->palette_buffer->host_ptr;
if (fb->hbd) {
const int stride = fb->planes[plane].stride >> 1;
const int offset = ((mi->mi_row * bsz) & (~3)) * stride + ((mi->mi_col * bsz) & (~3));
uint16_t *dst = (uint16_t *)(palette_buf + fb->planes[plane].offset - fb->base_offset) + offset;
for (i = 0; i < params->plane_height; ++i) {
for (int j = 0; j < params->plane_width; ++j) dst[j] = mi->palette_mode_info.palette_colors[map[j] + 8 * plane];
dst += stride;
map += params->plane_width;
}
if (plane) {
dst = (uint16_t *)(palette_buf + fb->planes[2].offset - fb->base_offset) + offset;
map = params->color_map;
for (i = 0; i < params->plane_height; ++i) {
for (int j = 0; j < params->plane_width; ++j) dst[j] = mi->palette_mode_info.palette_colors[map[j] + 16];
dst += stride;
map += params->plane_width;
}
}
} else {
const int stride = fb->planes[plane].stride;
const int offset = ((mi->mi_row * bsz) & (~3)) * stride + ((mi->mi_col * bsz) & (~3));
uint8_t *dst = palette_buf + fb->planes[plane].offset - fb->base_offset + offset;
for (i = 0; i < params->plane_height; ++i) {
for (int j = 0; j < params->plane_width; ++j)
dst[j] = static_cast<uint8_t>(mi->palette_mode_info.palette_colors[map[j] + 8 * plane]);
dst += stride;
map += params->plane_width;
}
if (plane) {
dst = palette_buf + fb->planes[2].offset - fb->base_offset + offset; // xd->dev_frame_planes[2] + offset;
map = params->color_map;
for (i = 0; i < params->plane_height; ++i) {
for (int j = 0; j < params->plane_width; ++j)
dst[j] = static_cast<uint8_t>(mi->palette_mode_info.palette_colors[map[j] + 16]);
dst += stride;
map += params->plane_width;
}
}
}
}
/*
 * Queues the current frame buffer (cm->cur_frame) for output without
 * running a decode pass.
 *
 * pbi  decoder instance; supplies the GPU core, common state and user_priv.
 *
 * If the frame already has an output image it is queued directly. If it has
 * none but does have a hardware buffer, a frame-data object is taken from
 * the pool and a command buffer containing only the output copy is
 * committed. In every case one work item is pushed to gpu_waiting_queue and
 * the frame's hw_show_image is cleared.
 */
extern "C" void av1_decode_sef(AV1Decoder *pbi) {
  Av1Core *const core = pbi->gpu_decoder;
  AV1_COMMON *const cm = &pbi->common;
  YV12_BUFFER_CONFIG *const shown = &cm->cur_frame->buf;

  GpuWorkItem *const work = (GpuWorkItem *)MTQueueGet(&core->gpu_item_pool);
  work->data = NULL;
  work->image = shown->hw_show_image;

  const bool needs_output_copy = work->image == NULL && shown->hw_buffer;
  if (needs_output_copy) {
    av1_frame_thread_data *const frame = (av1_frame_thread_data *)MTQueueGet(&core->frame_data_pool);
    frame->dst_frame_buffer = shown->hw_buffer;
    frame->is_hbd = shown->bit_depth > 8;
    if (frame->is_hbd) {
      frame->shaders = &core->shader_lib->shaders_hbd;
    } else {
      frame->shaders = &core->shader_lib->shaders_8bit;
    }
    frame->do_filmgrain = cm->film_grain_params.apply_grain;

    // The command-buffer helpers take only the core, so the frame is made
    // current before they run.
    core->curr_frame_data = frame;
    memset(frame->refs, 0, sizeof(frame->refs));
    frame->back_buffer0 = 0;
    frame_buffer_acquire(core, frame->dst_frame_buffer);

    av1_prepare_command_buffer(core);
    av1_postprocess_copy_output(core, cm);
    av1_commit_command_buffer(core);

    // Re-read hw_show_image: it was NULL on entry, so a non-NULL value here
    // was set during the calls above (presumably by the output copy).
    work->image = shown->hw_show_image;
    work->data = frame;
  }

  core->curr_frame_data = NULL;
  if (work->image) {
    work->image->user_priv = pbi->user_priv;
  }
  shown->hw_show_image = NULL;
  MTQueuePush(&core->gpu_waiting_queue, work);
}
/*
 * Records and commits the GPU work for the current frame, then queues the
 * frame for completion.
 *
 * pbi  decoder instance; supplies the GPU core, common state and user_priv.
 *
 * The stages are recorded in this order: block generation, inverse
 * transform, prediction, loop filter, CDEF / loop restoration, border
 * extension for inter prediction, output copy. After the commit, the current
 * frame data is moved into a work item and pushed to gpu_waiting_queue; the
 * core's curr_frame_data is cleared.
 *
 * Returns 0 unconditionally.
 */
extern "C" int av1_decode_frame_gpu(AV1Decoder *pbi) {
  Av1Core *dec = pbi->gpu_decoder;
  AV1_COMMON *cm = &pbi->common;

  av1_prepare_command_buffer(dec);
  av1_prediction_gen_blocks(pbi, dec);
  av1_idct_run(dec);
  av1_prediction_run_all(dec, cm, NULL);
  av1_loopfilter_gpu(dec, cm, &pbi->mb);
  av1_cdef_looprestoration(dec, cm, &pbi->lr_ctxt);
  av1_inter_ext_borders(dec, cm);
  av1_postprocess_copy_output(dec, cm);
  av1_commit_command_buffer(dec);

  GpuWorkItem *item = (GpuWorkItem *)MTQueueGet(&dec->gpu_item_pool);
  item->data = dec->curr_frame_data;
  item->image = NULL;
  dec->curr_frame_data = NULL;

  if (cm->show_frame) {
    item->image = cm->cur_frame->buf.hw_show_image;
    // Guard the dereference the same way av1_decode_sef does: a shown frame
    // without an output image is queued with a NULL image instead of
    // crashing here.
    if (item->image) item->image->user_priv = pbi->user_priv;
    cm->cur_frame->buf.hw_show_image = NULL;
  }

  MTQueuePush(&dec->gpu_waiting_queue, item);
  return 0;
}