Moved coefficient clamping and zeroing from CPU (function av1_read_coeffs_txb) to GPU (IDCT shaders)

Change-Id: I2dbc785ca94b71479aa438953263022720ca68f4
diff --git a/libav1/av1/decoder/decodetxb.c b/libav1/av1/decoder/decodetxb.c
index c4451da..955cd06 100644
--- a/libav1/av1/decoder/decodetxb.c
+++ b/libav1/av1/decoder/decodetxb.c
@@ -316,10 +316,10 @@
       if (sign) {
         dq_coeff = -dq_coeff;
       }
-      tcoeffs_raw[c] = clamp(dq_coeff, min_value, max_value);
+    tcoeffs_raw[c] = dq_coeff;// clamp(dq_coeff, min_value, max_value);
     }
-    else
-        tcoeffs_raw[c] = 0;
+  //else
+  //  tcoeffs_raw[c] = 0;
 
   }
 
@@ -328,49 +328,49 @@
   // DC value
   set_dc_sign(&cul_level, dc_val);
   {
-      const int tx_types_flags[] = {
-          0,        //    DCT_DCT,          
-          0x080000,    //    ADST_DCT,         
-          0x020000,    //    DCT_ADST,         
-          0x0A0000,    //    ADST_ADST,        
-          0x090000,    //    FLIPADST_DCT,     
-          0x028000,    //    DCT_FLIPADST,     
-          0x0B8000,    //    FLIPADST_FLIPADST,
-          0x0A8000,    //    ADST_FLIPADST,    
-          0x0B0000,    //    FLIPADST_ADST,    
-          0x140000,    //    IDTX,             
-          0x040000,    //    V_DCT,            
-          0x100000,    //    H_DCT,            
-          0x0C0000,    //    V_ADST,           
-          0x120000,    //    H_ADST,           
-          0x0D0000,    //    V_FLIPADST,       
-          0x128000    //    H_FLIPADST,       
-      };
+    const int tx_types_flags[] = {
+      0,    //  DCT_DCT,
+      0x080000,  //  ADST_DCT,
+      0x020000,  //  DCT_ADST,
+      0x0A0000,  //  ADST_ADST,
+      0x090000,  //  FLIPADST_DCT,
+      0x028000,  //  DCT_FLIPADST,
+      0x0B8000,  //  FLIPADST_FLIPADST,
+      0x0A8000,  //  ADST_FLIPADST,
+      0x0B0000,  //  FLIPADST_ADST,
+      0x140000,  //  IDTX,
+      0x040000,  //  V_DCT,
+      0x100000,  //  H_DCT,
+      0x0C0000,  //  V_ADST,
+      0x120000,  //  H_ADST,
+      0x0D0000,  //  V_FLIPADST,
+      0x128000  //  H_FLIPADST,
+    };
 
-      int mi_col = mbmi->mi_col;
-      int mi_row = mbmi->mi_row;
-      if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[mbmi->sb_type] == 1))
-          mi_row -= 1;
-      if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[mbmi->sb_type] == 1))
-          mi_col -= 1;
-      const uint32_t x = ((mi_col * MI_SIZE) >> (pd->subsampling_x + 2)) + blk_col;
-      const uint32_t y = ((mi_row * MI_SIZE) >> (pd->subsampling_y + 2)) + blk_row;
-      tx_block_info_gpu * block = tile_data->idct_blocks_host + tile_data->idct_blocks_ptr;
-      const uint32_t coef_count = ((*eob) + 3) >> 2;
-      const int av1_idct_scans_lut[TX_TYPES] = {
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2
-      };
+    int mi_col = mbmi->mi_col;
+    int mi_row = mbmi->mi_row;
+    if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[mbmi->sb_type] == 1))
+      mi_row -= 1;
+    if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[mbmi->sb_type] == 1))
+      mi_col -= 1;
+    const uint32_t x = ((mi_col * MI_SIZE) >> (pd->subsampling_x + 2)) + blk_col;
+    const uint32_t y = ((mi_row * MI_SIZE) >> (pd->subsampling_y + 2)) + blk_row;
+    tx_block_info_gpu * block = tile_data->idct_blocks_host + tile_data->idct_blocks_ptr;
+    const uint32_t coef_count = ((*eob) + 3) >> 2;
+    const int av1_idct_scans_lut[TX_TYPES] = {
+     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2
+    };
 
-      for (uint32_t c = *eob; c < (coef_count << 2); ++c)
-          tcoeffs_raw[c] = 0;
-      const int type_index = (xd->lossless[xd->mi[0]->segment_id] && tx_size == 0)? TX_SIZES_ALL : tx_size;
-      block->flags = coef_count | tx_types_flags[tx_type] | (plane << 21) | (av1_idct_scans_lut[tx_type] << 11);
-      block->input_offset = tile_data->dq_buffer_ptr;
-      block->output_pos = x | (y << 16);
-      block->sorting_idx = (tile_data->idct_blocks_sizes[type_index] << 8) + type_index;
-      ++tile_data->idct_blocks_sizes[type_index];
-      ++tile_data->idct_blocks_ptr;
-      tile_data->dq_buffer_ptr += coef_count << 2;
+    //for (uint32_t c = *eob; c < (coef_count << 2); ++c)
+    //  tcoeffs_raw[c] = 0;
+    const int type_index = (xd->lossless[xd->mi[0]->segment_id] && tx_size == 0)? TX_SIZES_ALL : tx_size;
+    block->flags = coef_count | tx_types_flags[tx_type] | (plane << 21) | (av1_idct_scans_lut[tx_type] << 11);
+    block->input_offset = tile_data->dq_buffer_ptr;
+    block->output_pos = x | (y << 16);
+    block->sorting_idx = (tile_data->idct_blocks_sizes[type_index] << 8) + type_index;
+    ++tile_data->idct_blocks_sizes[type_index];
+    ++tile_data->idct_blocks_ptr;
+    tile_data->dq_buffer_ptr += coef_count << 2;
   }
   return cul_level;
 }
diff --git a/libav1/dx/av1_compute.cpp b/libav1/dx/av1_compute.cpp
index 209bc40..03fccbc 100644
--- a/libav1/dx/av1_compute.cpp
+++ b/libav1/dx/av1_compute.cpp
@@ -194,7 +194,7 @@
   if (!shader_lib) return 0;
   ID3D12Device* _device = static_cast<ID3D12Device*>(d3d12device);
 
-  err |= (shader_lib->sig_idct = create_root_sig(_device, 2, 1, 2, 2)) == NULL;
+  err |= (shader_lib->sig_idct = create_root_sig(_device, 1, 2, 2, 2)) == NULL;
   err |= (shader_lib->sig_common111 = create_root_sig(_device, 1, 1, 1, 0)) == NULL;
   err |= (shader_lib->sig_common0102 = create_root_sig(_device, 0, 1, 0, 2)) == NULL;
   err |= (shader_lib->sig_common0110 = create_root_sig(_device, 0, 1, 1, 0)) == NULL;
diff --git a/libav1/dx/av1_core.cpp b/libav1/dx/av1_core.cpp
index 9f43a34..01822f8 100644
--- a/libav1/dx/av1_core.cpp
+++ b/libav1/dx/av1_core.cpp
@@ -165,13 +165,14 @@
   CHECK_RESULT(dec->loopfilter_blocks, mem->create_buffer(lf_blk_count * 32, MemoryType::DeviceOnly), do_assign);
   CHECK_RESULT(dec->mode_info_pool, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign);
   const int coef_buffer_size = target_width * target_height * 3 * IdctCoefCountNum / (IdctCoefCountDenum * 2);
-  CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::DeviceUpload),
+  CHECK_RESULT(dec->idct_coefs, mem->create_buffer(sizeof(int) * coef_buffer_size * 2, MemoryType::UploadUAV),
                do_assign);
   for (int i = 0; i < cfg.gpu_pipeline_depth; ++i) {
     av1_frame_thread_data *td = &dec->frame_thread_data[i];
     td->mode_info_max = mi_size >> 1;
     td->mode_info_offset = i * (mi_size >> 1);
     td->coef_buffer_offset = i * coef_buffer_size;
+    td->coef_buffer_size = coef_buffer_size * sizeof(int);
     CHECK_RESULT(td->command_buffer.cb_alloc, mem->create_buffer(1024 * 1024, MemoryType::DeviceUpload), do_assign);
     CHECK_RESULT(td->tile_data, (av1_tile_data *)mem->host_allocate(sizeof(av1_tile_data) * max_tiles), do_assign);
     CHECK_RESULT(td->gen_mi_block_indexes, mem->create_buffer(sizeof(int) * block_count_4x4 * 2, MemoryType::HostRW),
@@ -179,11 +180,8 @@
     CHECK_RESULT(td->gen_intra_inter_grid, mem->create_buffer(grid_w * grid_h * 6, MemoryType::HostRW), do_assign);
     CHECK_RESULT(td->gen_block_map, mem->create_buffer(dec->pred_map_size * sizeof(int), MemoryType::HostRW),
                  do_assign);
-    // CHECK_RESULT(td->mode_info, mem->create_buffer(sizeof(MB_MODE_INFO) * mi_size, MemoryType::HostRW), do_assign);
     CHECK_RESULT(td->mode_info_grid, mem->create_buffer(sizeof(MB_MODE_INFO *) * mi_size, MemoryType::HostRW),
                  do_assign);
-    // CHECK_RESULT(td->idct_coefs, mem->create_buffer(sizeof(int) * target_width * target_height * 3 / 2,
-    // MemoryType::DeviceUpload), do_assign);
     CHECK_RESULT(td->idct_blocks_unordered,
                  mem->create_buffer(block_count_4x4 * IdctBlockSize, MemoryType::DeviceUpload), do_assign);
 
@@ -195,7 +193,9 @@
     CHECK_RESULT(td->loop_rest_wiener, mem->create_buffer(64 * (block_count_4x4 >> 8), MemoryType::DeviceUpload),
                  do_assign);
     CHECK_RESULT(td->filmgrain_rand_offset,
-                 mem->create_buffer(sizeof(int) * (120 * (68 + 1)), MemoryType::DeviceUpload), do_assign);
+                 mem->create_buffer(sizeof(int) * ((target_width / 32 + 4) * (target_height / 32 + 1)),
+                                    MemoryType::DeviceUpload),
+                 do_assign);
 
     CHECK_RESULT(td->palette_buffer, mem->create_buffer(fb_size, MemoryType::DeviceUpload), do_assign);
   }
@@ -266,8 +266,7 @@
   D3D12_COMMAND_QUEUE_DESC desc = {};
   if (!context->queue) {
     desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
-    desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;  // enable_cpu_output == EnableHostOutput ?
-                                                 // D3D12_COMMAND_LIST_TYPE_DIRECT : D3D12_COMMAND_LIST_TYPE_COMPUTE;
+    desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
     hr = device->CreateCommandQueue(&desc, IID_PPV_ARGS(&context->queue));
     if (FAILED(hr)) return hr;
   }
@@ -295,9 +294,6 @@
   dx_compute_context *compute = &dec->compute;
   mem->set_dx_context(compute);
 
-  //    if (!cfg->out_buffers_cb.get_out_buffer_cb ||
-  //        !cfg->out_buffers_cb.release_out_buffer_cb)
-  //        return -1;
   dec->cb_get_output_image = cfg->out_buffers_cb.get_out_buffer_cb;
   dec->cb_release_image = cfg->out_buffers_cb.release_out_buffer_cb;
   dec->cb_notify_frame_ready = cfg->out_buffers_cb.notify_frame_ready_cb;
@@ -315,6 +311,7 @@
   rcfg.gpu_pipeline_depth = FrameThreadDataCount;
   rcfg.enable_superres = 1;
   if (av1_allocate_buffers(dec, mem, rcfg, 1)) return -1;
+  memset(dec->idct_coefs->host_ptr, 0, dec->idct_coefs->size);
 
   Microsoft::WRL::ComPtr<ID3D12Device> device = compute->device;
   dec->tryhdr10x3 = cfg->tryHDR10x3;
@@ -358,7 +355,6 @@
     img->size = 0;
     img->fb_ptr = NULL;
     img->is_valid = 0;
-    //        img->hw_buf = dec->output_frame_buffers[i];
     MTQueuePush(&dec->image_pool, img);
   }
 
@@ -368,8 +364,6 @@
   for (int i = 0; i < rcfg.ref_fb_count; ++i) {
     const int offset = dec->fb_offset + dec->fb_size * i;
     HwFrameBuffer *fb = &dec->fb_pool_src[i];
-    // fb->pool_ptr = pool;
-    // fb->fb_ptr = pool + offset;
     fb->size = dec->fb_size;
     fb->base_offset = offset;
     fb->ref_cnt = 0;
@@ -745,6 +739,8 @@
       MTQueuePush(&dec->frame_data_pool, td->sec_thread_data);
     }
     MTQueuePush(&dec->frame_data_pool, td);
+    int * coef_buf = (int *)dec->idct_coefs->host_ptr;
+    memset(coef_buf + td->coef_buffer_offset, 0, td->coef_buffer_size);
     dec->curr_frame_data = NULL;
   }
 
diff --git a/libav1/dx/av1_memory.cpp b/libav1/dx/av1_memory.cpp
index 7c75fe9..ab660e8 100644
--- a/libav1/dx/av1_memory.cpp
+++ b/libav1/dx/av1_memory.cpp
@@ -91,25 +91,23 @@
   obj->size = (size + DeviceMemAlign1) & (~(DeviceMemAlign1));
 
   D3D12_HEAP_TYPE heapLut[] = {D3D12_HEAP_TYPE_DEFAULT, D3D12_HEAP_TYPE_UPLOAD, D3D12_HEAP_TYPE_CUSTOM,
-                               D3D12_HEAP_TYPE_READBACK, D3D12_HEAP_TYPE_DEFAULT};
-  const D3D12_HEAP_TYPE heapType = heapLut[mem];
-  const D3D12_RESOURCE_STATES state = (mem == DeviceOnly) ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
-                                                          : (mem == DeviceOnlyConst)
-                                                                ? D3D12_RESOURCE_STATE_COPY_DEST
-                                                                : (mem == ReadBack) ? D3D12_RESOURCE_STATE_COPY_DEST
-                                                                                    : D3D12_RESOURCE_STATE_GENERIC_READ;
+                               D3D12_HEAP_TYPE_READBACK, D3D12_HEAP_TYPE_DEFAULT, D3D12_HEAP_TYPE_CUSTOM };
+  D3D12_RESOURCE_STATES stateLut[] = { D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_STATE_GENERIC_READ,
+                               D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS };
+  D3D12_CPU_PAGE_PROPERTY pagePropLut[] = { D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_WRITE_BACK,
+                              D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE };
 
   D3D12_HEAP_PROPERTIES heapProp;
-  heapProp.Type = heapType;
-  heapProp.CPUPageProperty = mem == HostRW ? D3D12_CPU_PAGE_PROPERTY_WRITE_BACK : D3D12_CPU_PAGE_PROPERTY_UNKNOWN;  //
-  heapProp.MemoryPoolPreference = mem == HostRW ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN;
+  heapProp.Type = heapLut[mem];
+  heapProp.CPUPageProperty = pagePropLut[mem];  //
+  heapProp.MemoryPoolPreference = (mem == HostRW || mem == UploadUAV) ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN;
   heapProp.CreationNodeMask = 0;
   heapProp.VisibleNodeMask = 0;
 
   const D3D12_RESOURCE_FLAGS flags =
-      (mem == DeviceOnly) ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE;
+      (mem == DeviceOnly || mem == UploadUAV) ? D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS : D3D12_RESOURCE_FLAG_NONE;
   HRESULT hr = context->device->CreateCommittedResource(&heapProp, D3D12_HEAP_FLAG_NONE,
-                                                        &CD3DX12_RESOURCE_DESC::Buffer(obj->size, flags), state, NULL,
+                                                        &CD3DX12_RESOURCE_DESC::Buffer(obj->size, flags), stateLut[mem], NULL,
                                                         __uuidof(*obj->dev), reinterpret_cast<void **>(&obj->dev));
 
   if (SUCCEEDED(hr) && mem != DeviceOnly && mem != DeviceOnlyConst) {
diff --git a/libav1/dx/av1_memory.h b/libav1/dx/av1_memory.h
index 6b88eda..38c2573 100644
--- a/libav1/dx/av1_memory.h
+++ b/libav1/dx/av1_memory.h
@@ -40,6 +40,7 @@
   HostRW = 2,
   ReadBack = 3,
   DeviceOnlyConst = 4,
+  UploadUAV = 5,
 };
 
 using Microsoft::WRL::ComPtr;
diff --git a/libav1/dx/shaders/idct_lossless.hlsl b/libav1/dx/shaders/idct_lossless.hlsl
index 15de470..59ed5f9 100644
--- a/libav1/dx/shaders/idct_lossless.hlsl
+++ b/libav1/dx/shaders/idct_lossless.hlsl
@@ -34,12 +34,18 @@
   const int coef_count = block.x & 0x7ff;
 
   const int input_offset = (block.y + wi * 4) << 2;
-  int4 coefs = (wi < coef_count) ? (int4)buf_input.Load4(input_offset) : int4(0, 0, 0, 0);
-
-  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = coefs.x;
-  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = coefs.y;
-  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = coefs.z;
-  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = coefs.w;
+  int4 coefs = int4(0, 0, 0, 0);
+  if (wi < coef_count)
+  {
+      coefs = (int4)buf_input.Load4(input_offset);
+      buf_input.Store4(input_offset, int4(0, 0, 0, 0));
+  }
+  const int coef_min = -(1 << (cb_bitdepth + 7));
+  const int coef_max = (1 << (cb_bitdepth + 7)) - 1;
+  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].x] = clamp(coefs.x, coef_min, coef_max);
+  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].y] = clamp(coefs.y, coef_min, coef_max);
+  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].z] = clamp(coefs.z, coef_min, coef_max);
+  shared_4x4_mem[loc_mem_offset + cb_scans[scan_offset].w] = clamp(coefs.w, coef_min, coef_max);
 
   GroupMemoryBarrier();
 
diff --git a/libav1/dx/shaders/idct_shader_common.h b/libav1/dx/shaders/idct_shader_common.h
index 63bed6a..425e768 100644
--- a/libav1/dx/shaders/idct_shader_common.h
+++ b/libav1/dx/shaders/idct_shader_common.h
@@ -19,9 +19,9 @@
 
 #define IdctBlockSize 16
 
-ByteAddressBuffer buf_input : register(t0);
-ByteAddressBuffer buf_blocks : register(t1);
+ByteAddressBuffer buf_blocks : register(t0);
 RWByteAddressBuffer buf_dst : register(u0);
+RWByteAddressBuffer buf_input : register(u1);
 
 cbuffer cb_idct_frame_data : register(b1) {
   uint4 cb_planes[3];
@@ -33,6 +33,9 @@
   uint cb_wicount;
 };
 
+int clamp_value(int value, int2 range) { return clamp(value, range.x, range.y); }
+
+
 #define NewSqrt2Bits 12
 #define NewSqrt2 5793
 #define NewInvSqrt2 2896
@@ -61,7 +64,13 @@
     for (i = 0; i < COEF_LOOP; ++i) {                                                                                 \
       int4 coefs = int4(0, 0, 0, 0);                                                                                  \
       if ((wi + i * N) < coef_count) {                                                                                \
-        coefs = (int4)buf_input.Load4(input_offset * 4 + i * N * 16);                                                 \
+        const int addr = input_offset * 4 + i * N * 16;                                                               \
+        coefs = (int4)buf_input.Load4(addr);                                                                          \
+        buf_input.Store4(addr, uint4(0, 0, 0, 0));                                                                    \
+        coefs.x = clamp_value(coefs.x, row_clamp);                                                                    \
+        coefs.y = clamp_value(coefs.y, row_clamp);                                                                    \
+        coefs.z = clamp_value(coefs.z, row_clamp);                                                                    \
+        coefs.w = clamp_value(coefs.w, row_clamp);                                                                    \
         if (SCALE_COEF) {                                                                                             \
           coefs.x = round_shift(coefs.x * NewInvSqrt2, NewSqrt2Bits);                                                 \
           coefs.y = round_shift(coefs.y * NewInvSqrt2, NewSqrt2Bits);                                                 \
@@ -178,8 +187,6 @@
 
 static const int sinpi[] = {0, 1321, 2482, 3344, 3803};
 
-int clamp_value(int value, int2 range) { return clamp(value, range.x, range.y); }
-
 int round_shift(int value, int bit) {
   if (bit == 0) return value;
   return (int)((value + (1 << (bit - 1))) >> bit);
diff --git a/libav1/dx/transform.cpp b/libav1/dx/transform.cpp
index 3b3e844..d087b86 100644
--- a/libav1/dx/transform.cpp
+++ b/libav1/dx/transform.cpp
@@ -96,9 +96,9 @@
   av1_tile_data *tdata = td->tile_data;
 
   command_list->SetComputeRootSignature(dec->shader_lib->sig_idct.Get());
-  command_list->SetComputeRootShaderResourceView(0, dec->idct_coefs->dev->GetGPUVirtualAddress());
-  command_list->SetComputeRootShaderResourceView(1, dec->idct_blocks->dev->GetGPUVirtualAddress());
-  command_list->SetComputeRootUnorderedAccessView(2, dec->idct_residuals->dev->GetGPUVirtualAddress());
+  command_list->SetComputeRootShaderResourceView(0, dec->idct_blocks->dev->GetGPUVirtualAddress());
+  command_list->SetComputeRootUnorderedAccessView(1, dec->idct_residuals->dev->GetGPUVirtualAddress());
+  command_list->SetComputeRootUnorderedAccessView(2, dec->idct_coefs->dev->GetGPUVirtualAddress());
   command_list->SetComputeRootConstantBufferView(4, cbo.dev_address);
   for (int type = 0; type <= TX_SIZES_ALL; ++type) {
     int offset = tdata->idct_blocks_sizes[type];
diff --git a/libav1/dx/types.h b/libav1/dx/types.h
index b4fba6d..b689df8 100644
--- a/libav1/dx/types.h
+++ b/libav1/dx/types.h
@@ -42,7 +42,6 @@
   uint32_t flags;
   MV mv0;
   MV mv1;
-  // uint32_t sorting_idx;
 };
 
 enum InterTypes {
@@ -159,6 +158,7 @@
   int mode_info_offset;
   int mode_info_max;
   int coef_buffer_offset;
+  int coef_buffer_size;
   av1_tile_data* tile_data;
   HwFrameBuffer* frame_buffer;
   HwFrameBuffer* back_buffer0;
@@ -242,5 +242,4 @@
   int tryhdr10x3;
 } Av1Core;
 
-Av1Core* Get(uint32_t may_be_null = 0);
 void PutPerfMarker(av1_frame_thread_data* td, volatile int64_t* marker);