// blob: 5bfbee97ee1f6cc2a9743198467e01a92e174077
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "mode_info.h"
// ---------------------------------------------------------------------------
// Inter pass-type identifiers.
// main() turns a pass type into a slot of blocks_index_base as
//   (type - 1) * InterSizesAllCommon + size_index.
// NOTE(review): only ObmcAbove and ObmcLeft are referenced by name in this
// file; the compound pass types are produced there by packed nibble tables
// (e.g. 0x3422, 0x6422) whose values coincide with the numbers below.
// ---------------------------------------------------------------------------
#define Warp 0
#define CasualInter 1
#define CompoundAvrg 2
#define CompoundDiff 3
#define CompoundMasked 4
#define CompoundGlobalWarp 5
#define CompoundDiffUv 6
#define CompoundDiffUvGlobalWarp 7
#define ObmcAbove 8
#define ObmcLeft 9
// Not referenced in this file.
#define Inter2x2 10
#define Inter2x2Comp 11
#define Inter2x2CompP2 12
// Number of size slots reserved per inter pass type.
#define InterSizesAllCommon 24
// First slot used for the 2x2 (sub-8x8 chroma) lists: 9 * InterSizesAllCommon.
#define Inter2x2ArrOffset 216
#define Inter2x2Count 3
// Inter2x2ArrOffset + Inter2x2Count.
#define InterCountsAll 219
// Not referenced in this file; the values coincide with the results of the
// gpu_comp_type nibble tables (0x2100 / 0x3100) used in main().
#define CompoundTypeAvrg 0
#define CompoundTypeMasked 1
#define CompoundTypeDiffY 2
#define CompoundTypeDiffUv 3
// Intra / reconstruction list layout. Note the arithmetic relationship:
//   ReconBlockOffset (219) + ReconstructBlockSizes (36) + IntraSizes (9)
//   == IntraBlockOffset (264).
// main() addresses reconstruction lists as ReconBlockOffset + bw_log + 6 * bh_log
// and intra lists as iter * IntraTypeCount + IntraBlockOffset - 1 - (txw + txh),
// or iter * IntraTypeCount + IntraBlockOffset for filter-intra blocks.
#define IntraSizes 9
#define ReconstructBlockSizes 36
#define IntraTypeCount 10
#define IntraBlockOffset 264
#define ReconBlockOffset 219
// Intra prediction mode codes (compared against mode bytes of MB_MODE_INFO::modes).
#define DC_PRED 0
#define V_PRED 1
#define H_PRED 2
#define D45_PRED 3
#define D135_PRED 4
#define D113_PRED 5
#define D157_PRED 6
#define D203_PRED 7
#define D67_PRED 8
#define SMOOTH_PRED 9
#define SMOOTH_V_PRED 10
#define SMOOTH_H_PRED 11
#define PAETH_PRED 12
// Shares the value 13 with NEARESTMV; main() only compares it against the
// chroma mode (bits 8.. of MB_MODE_INFO::modes).
#define UV_CFL_PRED 13
// Single-reference inter mode codes (low byte of MB_MODE_INFO::modes).
#define NEARESTMV 13
#define NEARMV 14
#define GLOBALMV 15
#define NEWMV 16
// Compound ref compound modes
#define NEAREST_NEARESTMV 17
#define NEAR_NEARMV 18
#define NEAREST_NEWMV 19
#define NEW_NEARESTMV 20
#define NEAR_NEWMV 21
#define NEW_NEARMV 22
#define GLOBAL_GLOBALMV 23
#define NEW_NEWMV 24
#define MB_MODE_COUNT 25
// Half-open range [START, END) of single-reference inter modes.
#define SINGLE_INTER_MODE_START NEARESTMV
#define SINGLE_INTER_MODE_END NEAREST_NEARESTMV
// Block size codes (low byte of MB_MODE_INFO::block_type). They index the
// mi_size_wide_log2 / mi_size_high_log2 tables declared in main().
#define BLOCK_4X4 0
#define BLOCK_4X8 1
#define BLOCK_8X4 2
#define BLOCK_8X8 3
#define BLOCK_8X16 4
#define BLOCK_16X8 5
#define BLOCK_16X16 6
#define BLOCK_16X32 7
#define BLOCK_32X16 8
#define BLOCK_32X32 9
#define BLOCK_32X64 10
#define BLOCK_64X32 11
#define BLOCK_64X64 12
#define BLOCK_64X128 13
#define BLOCK_128X64 14
#define BLOCK_128X128 15
#define BLOCK_4X16 16
#define BLOCK_16X4 17
#define BLOCK_8X32 18
#define BLOCK_32X8 19
#define BLOCK_16X64 20
#define BLOCK_64X16 21
// Motion mode codes (main() reads them from MB_MODE_INFO::modes >> 24).
#define SIMPLE_TRANSLATION 0
#define OBMC_CAUSAL 1
#define WARPED_CAUSAL 2
#define MOTION_MODES 3
// Compound type codes (main() reads them from interinter_comp.type).
#define COMPOUND_AVERAGE 0
#define COMPOUND_DISTWTD 1
#define COMPOUND_WEDGE 2
#define COMPOUND_DIFFWTD 3
#define COMPOUND_TYPES 4
#define MASKED_COMPOUND_TYPES 2
// Bit ORed into the flags word of an inter work item when
// (tx_info & 0xff00) == 0 and the block is neither inter-intra nor OBMC.
#define InterNoSkipFlag 0x2000
// One bit per intra mode: bit N is set when mode N needs that neighbour edge.
// main() tests them as (Lut >> mode) & 1.
#define NeedAboveLut 0x3f7f
#define NeedLeftLut 0x3Ef7
// The next three tables are not referenced in this file.
#define NeedRightLut 0x010A
#define NeedBotLut 0x0084
#define NeedAboveLeftLut 0x11ff
// Nibble-packed filter remap table. main() looks up nibble
//   (filter & 15) + 4 * (dimension_log2 > 0)
// and keeps its low 3 bits.
#define InterFilterLut 0x25432010
// Mode-info records; main() reads record cb_mi_offset + thread.x and also
// neighbour records located through mi_grid.
StructuredBuffer<MB_MODE_INFO> buffer_mi : register(t0);
// Per-work-item indexes, read sequentially as dwords starting at
// (mi.index_base + cb_mi_idx_base) * 4.
ByteAddressBuffer blocks_indexes : register(t1);
// Base index of every output list, one dword per list slot.
ByteAddressBuffer blocks_index_base : register(t2);
// Grid of mode-info addresses, 8 bytes per entry (see get_mi_index).
ByteAddressBuffer mi_grid : register(t3);
// Per-position value that main() multiplies by IntraTypeCount to pick the
// intra list for a transform block.
ByteAddressBuffer intra_iter_grid : register(t4);
// Output: 16-byte (uint4) prediction / reconstruction work items.
RWByteAddressBuffer pred_blocks : register(u0);
// Output: 48-byte warp work items.
RWByteAddressBuffer pred_blocks_warp : register(u1);
// Frame-level constants consumed by main().
cbuffer GenBlockData : register(b0) {
// Frame extent in mode-info units; used to clip block extents at the edges.
uint cb_mi_cols;
uint cb_mi_rows;
// Row stride of mi_grid, in entries.
uint cb_mi_stride;
// Subtracted from the address stored in mi_grid before dividing by ModeInfoSize.
uint cb_mi_addr_base;
// Row stride of the luma part of intra_iter_grid, in dwords.
uint cb_iter_grid_stride;
// Dword offset and row stride of the chroma part of intra_iter_grid.
uint cb_iter_grid_offset_uv;
uint cb_iter_grid_stride_uv;
// Non-zero disables the directional-intra edge filter / upsample decisions.
uint cb_disable_edge_filter;
// Non-zero disables warped prediction (name is spelled "integet" in the layout).
uint cb_force_integet_mv;
// Not referenced in this file.
int3 cb_reserved;
// Indexed by block size code (22 entries); main() reads .x for luma and .y
// for chroma wedge-mask offsets.
int4 cb_wedge_offsets[22];
// Indexed by (ref0 - 1) + (ref1 - 1) * 8; .x is used for COMPOUND_DISTWTD.
int4 cb_dist_wtd[8 * 8];
// Indexed by (tx_info >> 24) & 7; non-zero .x forces the transform size logs to 0.
int4 cb_lossless_seg[8];
// Indexed by ref - 1; non-zero .x marks a reference with a global warp.
int4 cb_global_warp[8];
// Indexed by ref0 - 1; used when the block is warped but not locally warped.
struct {
WarpedMotionParams params;
int pad;
} cb_wm_params[8];
};
// Per-dispatch constants consumed by main().
cbuffer GenBlockSRT : register(b1) {
// Number of work items; threads with thread.x >= cb_wi_count exit immediately.
uint cb_wi_count;
// Added to thread.x to obtain the index into buffer_mi.
uint cb_mi_offset;
// Added to MB_MODE_INFO::index_base to locate the block's entries in blocks_indexes.
uint cb_mi_idx_base;
// Neighbours are treated as available only when mi_col / mi_row is greater
// than these values (names are spelled "srart" in the layout).
uint cb_col_srart;
uint cb_row_srart;
// Dword offsets into blocks_index_base for the pred_blocks lists and for the
// pred_blocks_warp lists respectively.
uint cb_index_offset;
uint cb_index_offset_warp;
};
// Selects the intra edge filter strength (0..3).
//   blk_wh - sum of the transform block width and height
//   d      - absolute distance of the prediction angle from 90 or 180 degrees
//   type   - 0 for the regular thresholds, non-zero for the alternate set
//            (callers pass the "neighbour uses a smooth mode" flag)
// Thresholds get more aggressive as the block grows.
int intra_edge_filter_strength(int blk_wh, int d, int type) {
  if (type == 0) {
    if (blk_wh <= 8) return d >= 56 ? 1 : 0;
    // Sizes 9..12 and 13..16 share the same threshold.
    if (blk_wh <= 16) return d >= 40 ? 1 : 0;
    if (blk_wh <= 24) return d >= 32 ? 3 : (d >= 16 ? 2 : (d >= 8 ? 1 : 0));
    if (blk_wh <= 32) return d >= 32 ? 3 : (d >= 4 ? 2 : (d >= 1 ? 1 : 0));
    return d >= 1 ? 3 : 0;
  }
  if (blk_wh <= 8) return d >= 64 ? 2 : (d >= 40 ? 1 : 0);
  if (blk_wh <= 16) return d >= 48 ? 2 : (d >= 20 ? 1 : 0);
  if (blk_wh <= 24) return d >= 4 ? 3 : 0;
  return d >= 1 ? 3 : 0;
}
// Converts a position in the mode-info grid into an index into buffer_mi.
//   grid  - buffer holding one 8-byte entry per grid position; only the first
//           dword of the entry (a mode-info address) is read
//   index - grid entry number (column + row * stride at the call sites)
//   base  - address corresponding to record 0
// ModeInfoSize is not defined in this file (presumably it comes from mode_info.h).
uint get_mi_index(ByteAddressBuffer grid, int index, uint base) {
  const uint entry_addr = grid.Load(index * 8);
  return (entry_addr - base) / ModeInfoSize;
}
// Compute-shader entry point: one thread per MB_MODE_INFO record
// (buffer_mi[cb_mi_offset + thread.x]).
//
// The thread unpacks the block's mode info and writes packed work items:
//   * pred_blocks      - 16-byte uint4 records (inter, OBMC, intra, reconstruction)
//   * pred_blocks_warp - 48-byte records (warped prediction)
//
// Every destination slot is blocks_index_base[cb_index_offset(+_warp) + list]
// plus an index read from blocks_indexes at index_addr. index_addr starts at
// (mi.index_base + cb_mi_idx_base) * 4 and advances by 4 bytes per index
// consumed, so the sequence of loads below is order sensitive.
// NOTE(review): presumably that order must match whatever produced
// blocks_indexes - confirm before reordering any section.
//
// Common record fields written below:
//   .x      column | (row << 16)
//   .y      flags; bits 0..1 select the plane for inter/recon items
//           (0 = luma, 1 and 2 = the two chroma planes; `^= 3` turns 1 into 2)
//   .z/.w   motion vectors or mode-specific data
[numthreads(64, 1, 1)] void main(uint3 thread
: SV_DispatchThreadID) {
if (thread.x >= cb_wi_count) return;
// Size slot inside a pass-type group, indexed by [log2 width][log2 height]
// in mode-info units; -1 marks combinations that have no slot.
const int InterBlockSizeIndexLUT[6][6] = {
// h: 4 8 16 32 64 128
{0, 1, 2, -1, -1, -1}, // w = 4 (4)
{3, 4, 5, 6, -1, -1}, // w = 8
{7, 8, 9, 10, 11, -1}, // w = 16
{-1, 12, 13, 14, 15, -1}, // w = 32
{-1, -1, 16, 17, 18, 19}, // w = 64
{-1, -1, -1, -1, 20, 21} // w = 128
};
// log2 of block width / height in mode-info units, indexed by block size code.
const int mi_size_wide_log2[] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4};
const int mi_size_high_log2[] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2};
const int mi_addr = cb_mi_offset + thread.x;
MB_MODE_INFO mi = buffer_mi[mi_addr];
// block_type packs: bits 0..7 block size, bits 16..23 ref0, bits 24..31 ref1
// (both refs are sign-extended below).
int bsize = mi.block_type & 255;
const int bw_log = mi_size_wide_log2[bsize];
const int bh_log = mi_size_high_log2[bsize];
const int bw = 1 << bw_log;
const int bh = 1 << bh_log;
// Chroma dimensions are half the luma ones, clamped at log2 = 0.
const int bw_log_uv = max(0, bw_log - 1);
const int bh_log_uv = max(0, bh_log - 1);
const uint mi_row = mi.mi_row;
const uint mi_col = mi.mi_col;
// NOTE(review): despite the name, this is non-zero for a block that sits on
// an even row with odd height or on an even column with odd width (in
// mode-info units). Every use below is negated (`!is_chroma_ref`) before
// chroma work is emitted, i.e. such blocks emit no chroma.
const int is_chroma_ref = ((mi_row & 1) == 0 && (bh & 1) == 1) || ((mi_col & 1) == 0 && (bw & 1) == 1);
const int ref0 = ((int)mi.block_type << 8) >> 24;
const int ref1 = ((int)mi.block_type) >> 24;
const int mode = mi.modes & 255;
// Inter-intra: single reference (ref1 == 0), size 8x8..32x32 and a
// single-reference inter mode.
const int is_inter_intra = ref0 > 0 && ref1 == 0 && bsize >= BLOCK_8X8 && bsize <= BLOCK_32X32 &&
mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
// Byte address of this block's first entry in blocks_indexes.
int index_addr = (mi.index_base + cb_mi_idx_base) * 4;
const int motion_mode = mi.modes >> 24;
// OBMC from a side is only emitted when a neighbour exists past the start column / row.
const int is_obmc_left = ref0 > 0 && motion_mode == OBMC_CAUSAL && mi_col > cb_col_srart;
const int is_obmc_above = ref0 > 0 && motion_mode == OBMC_CAUSAL && mi_row > cb_row_srart;
// ===========================================================================
// Inter prediction (ref0 > 0)
// ===========================================================================
if (ref0 > 0) {
const int is_compound = ref1 > 0;
// Warp needs fractional MVs enabled and a block of at least 2x2 mode-info units.
const int allow_warp = cb_force_integet_mv == 0 && bw_log > 0 && bh_log > 0;
const int allow_global_warp = allow_warp && (mode == GLOBALMV || mode == GLOBAL_GLOBALMV);
const int is_global_warp0 = cb_global_warp[ref0 - 1].x && allow_global_warp;
const int is_global_warp1 = (is_compound == 0 || allow_global_warp == 0) ? 0 : cb_global_warp[ref1 - 1].x;
// Local warp is rejected when bit 16 of wm_params.type is set.
const int is_local_warp = motion_mode == WARPED_CAUSAL && (mi.wm_params.type & 0x10000) == 0;
const int is_luma_warp = (is_local_warp || is_global_warp0) && allow_warp;
const int no_skip_flag =
((mi.tx_info & 0xff00) == 0 && !is_inter_intra && !is_obmc_left && !is_obmc_above) ? InterNoSkipFlag : 0;
const int block_size_id_y = InterBlockSizeIndexLUT[bw_log][bh_log];
const int comp_type = mi.interinter_comp.type;
// wtd: compound weighting data, later placed at bit 17 of the flags word.
uint wtd = 0;
const int wedge_idx = mi.interinter_comp.wedge_sign + mi.interinter_comp.wedge_index * 2;
// ---- Luma ----
if (is_compound) {
// 0x88 is the value used for COMPOUND_AVERAGE (no branch below overrides it).
wtd = 0x88;
if (comp_type == COMPOUND_DISTWTD) {
wtd = cb_dist_wtd[ref0 - 1 + (ref1 - 1) * 8].x;
} else if (comp_type == COMPOUND_WEDGE) {
// Wedge mask offset: per-size base plus wedge_idx masks of (bw * bh / 4) units each.
wtd = cb_wedge_offsets[bsize].x + (wedge_idx << (bw_log + bh_log - 2));
} else if (comp_type == COMPOUND_DIFFWTD) {
wtd = mi.interinter_comp.mask_type;
}
wtd <<= 17;
const int is_warp_compound = is_global_warp0 || is_global_warp1;
// Filter remap: nibble (filter & 15) + 4 * (dimension log2 > 0) of InterFilterLut.
const uint filter_type_h =
(InterFilterLut >> ((((mi.interp_filters >> 16) & 15) << 2) + ((bw_log > 0) << 4))) & 7;
const uint filter_type_v = (InterFilterLut >> (((mi.interp_filters & 15) << 2) + ((bh_log > 0) << 4))) & 7;
// comp_type 0,1 -> 0; 2 (wedge) -> 1; 3 (diff-weighted) -> 2.
const int gpu_comp_type = (0x2100 >> (comp_type * 4)) & 15;
uint4 block;
block.x = mi_col | (mi_row << 16);
// Flags: plane 0 | ref0 at bit 2 | filters at bits 5 and 9 | ref1 at bit 14 |
// wtd at bit 17 | InterNoSkipFlag | compound type at bit 30.
block.y = ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((ref1 - 1) << 14) | wtd |
no_skip_flag | (gpu_comp_type << 30);
// Luma MVs are shifted left by one with bit 16 cleared afterwards
// (presumably mv packs two 16-bit components, so the mask drops the bit
// carried from the low half into the high half - TODO confirm).
block.z = (mi.mv[0] << 1) & 0xfffeffff;
block.w = (mi.mv[1] << 1) & 0xfffeffff;
// comp_type 0,1 -> pass 2; 2 -> pass 4; 3 -> pass 3; global warp -> pass 5.
const int pass_type = is_warp_compound ? 5 : ((0x3422 >> (comp_type * 4)) & 15);
int pass_type_index = (pass_type - 1) * InterSizesAllCommon + block_size_id_y;
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + pass_type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
pred_blocks.Store4(dst_ptr * 16, block);
} else {
if (is_luma_warp) {
// Warp record (48 bytes): +0 position, +4 flags, +8 mat[0] (16 bytes),
// +24 mat[1].xy (8 bytes), +32 four unpacked angle values.
int dst_ptr =
blocks_index_base.Load(4 * (cb_index_offset_warp + block_size_id_y)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 48;
pred_blocks_warp.Store(dst_ptr, mi_col | (mi_row << 16));
pred_blocks_warp.Store(dst_ptr + 4, ((ref0 - 1) << 2) | no_skip_flag);
WarpedMotionParams params;
if (is_local_warp)
params = mi.wm_params;
else
params = cb_wm_params[ref0 - 1].params;
pred_blocks_warp.Store4(dst_ptr + 8, params.mat[0]);
pred_blocks_warp.Store2(dst_ptr + 24, params.mat[1].xy);
// Each angles component holds two 16-bit values that are split here:
// low half via shift-left/shift-right, high half via a right shift.
// NOTE(review): angles.y is unpacked into .w (low) and .z (high), i.e. in
// the opposite component order to angles.x.
int4 angle32;
angle32.x = ((int)(params.angles.x << 16)) >> 16;
angle32.y = ((int)params.angles.x) >> 16;
angle32.w = ((int)(params.angles.y << 16)) >> 16;
angle32.z = ((int)params.angles.y) >> 16;
pred_blocks_warp.Store4(dst_ptr + 32, angle32);
} else {
// Plain single-reference translation: list slot is the size index itself.
uint4 block;
block.x = mi_col | (mi_row << 16);
const uint filter_type_h =
(InterFilterLut >> ((((mi.interp_filters >> 16) & 15) << 2) + ((bw_log > 0) << 4))) & 7;
const uint filter_type_v = (InterFilterLut >> (((mi.interp_filters & 15) << 2) + ((bh_log > 0) << 4))) & 7;
block.y = ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | no_skip_flag;
block.z = (mi.mv[0] << 1) & 0xfffeffff;
block.w = 0;
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + block_size_id_y)) + blocks_indexes.Load(index_addr);
index_addr += 4;
pred_blocks.Store4(dst_ptr * 16, block);
}
}
// ---- Chroma ----
if (!is_chroma_ref) {
const int block_size_id_uv = InterBlockSizeIndexLUT[bw_log_uv][bh_log_uv];
const int mi_col_uv = mi_col >> 1;
const int mi_row_uv = mi_row >> 1;
// A block that is one mode-info unit wide or high may have to build its
// chroma from the neighbouring blocks' motion. The neighbours default to
// the current block and are replaced through mi_grid below.
int sub8x8 = bw_log == 0 || bh_log == 0;
int mi_addr_above = mi_addr;
int mi_addr_left = mi_addr;
int mi_addr_aboveleft = mi_addr;
if (sub8x8) {
int dy = bh_log == 0 ? -1 : 0;
int dx = bw_log == 0 ? -1 : 0;
mi_addr_left = get_mi_index(mi_grid, mi_col + dx + mi_row * cb_mi_stride, cb_mi_addr_base);
mi_addr_above = get_mi_index(mi_grid, mi_col + (mi_row + dy) * cb_mi_stride, cb_mi_addr_base);
mi_addr_aboveleft = get_mi_index(mi_grid, mi_col + dx + (mi_row + dy) * cb_mi_stride, cb_mi_addr_base);
// The per-neighbour path is only kept when all three neighbours have ref0 > 0.
sub8x8 &= (((int)buffer_mi[mi_addr_left].block_type << 8) >> 24) > 0;
sub8x8 &= (((int)buffer_mi[mi_addr_above].block_type << 8) >> 24) > 0;
sub8x8 &= (((int)buffer_mi[mi_addr_aboveleft].block_type << 8) >> 24) > 0;
}
if (sub8x8) {
// Even-aligned origin of the region covered by the emitted items.
int x = mi_col & (~1);
int y = mi_row & (~1);
const int brows = bh_log == 2 ? 4 : 2;
const int bcols = bw_log == 2 ? 4 : 2;
const int bh_flag = bh_log != 0 ? ((brows - 1) << 28) : 0; // for scale
const int bw_flag = bw_log != 0 ? ((bcols - 1) << 26) : 0;
for (int row = 0; row < brows; ++row) {
// Row 0 takes motion from the above neighbour, later rows from this block.
int mi_index_1 = row == 0 ? mi_addr_above : mi_addr;
if (bw_log == 0) {
// Narrow block: two items per row, the left one from the left /
// above-left neighbour (mi_index_0) and the right one from mi_index_1.
const int mi_index_0 = row == 0 ? mi_addr_aboveleft : mi_addr_left;
const int block_type0 = (int)buffer_mi[mi_index_0].block_type;
const int block_type1 = (int)buffer_mi[mi_index_1].block_type;
const int is_compound0 = (block_type0 >> 24) > 0;
const int is_compound1 = (block_type1 >> 24) > 0;
const int diff_comp = is_compound0 != is_compound1;
// List offset: 0 for single reference, 1 for compound when both sources
// agree, 2 for the compound source when they differ.
int type_index0 = Inter2x2ArrOffset + (is_compound0 << diff_comp);
int type_index1 = Inter2x2ArrOffset + (is_compound1 << diff_comp);
const int interp_filters0 = buffer_mi[mi_index_0].interp_filters;
const int interp_filters1 = buffer_mi[mi_index_1].interp_filters;
const int flags = 1 | // U-plane
(is_compound0 == is_compound1) << 25 | // combo write
no_skip_flag | bh_flag;
int dst_ptr0 =
blocks_index_base.Load(4 * (cb_index_offset + type_index0)) + blocks_indexes.Load(index_addr);
index_addr += 4;
// When both sources share a list the second item follows the first;
// otherwise it consumes its own index from blocks_indexes.
int dst_ptr1 = dst_ptr0 + 1;
if (diff_comp) {
dst_ptr1 = blocks_index_base.Load(4 * (cb_index_offset + type_index1)) + blocks_indexes.Load(index_addr);
index_addr += 4;
}
dst_ptr0 *= 16;
dst_ptr1 *= 16;
uint filter_type_h =
(InterFilterLut >> ((((interp_filters0 >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
uint filter_type_v = (InterFilterLut >> (((interp_filters0 & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
uint4 block0;
block0.x = x | ((y + row) << 16);
block0.y = flags | ((((block_type0 << 8) >> 24) - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
((((block_type0 >> 24) - 1) & 7) << 14);
// Chroma items store the MVs unmodified (no shift / mask as for luma).
block0.z = buffer_mi[mi_index_0].mv[0];
block0.w = buffer_mi[mi_index_0].mv[1];
filter_type_h = (InterFilterLut >> ((((interp_filters1 >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
filter_type_v = (InterFilterLut >> (((interp_filters1 & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
uint4 block1;
block1.x = block0.x + 1;
block1.y = flags | ((((block_type1 << 8) >> 24) - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
((((block_type1 >> 24) - 1) & 7) << 14);
block1.z = buffer_mi[mi_index_1].mv[0];
block1.w = buffer_mi[mi_index_1].mv[1];
pred_blocks.Store4(dst_ptr0, block0);
pred_blocks.Store4(dst_ptr1, block1);
// Second chroma plane: plane bits 1 -> 2. The copies land 32 bytes
// further on in the shared-list case and 16 bytes when the lists differ.
block0.y ^= 3;
block1.y ^= 3;
dst_ptr0 += 32 >> diff_comp;
dst_ptr1 += 32 >> diff_comp;
pred_blocks.Store4(dst_ptr0, block0);
pred_blocks.Store4(dst_ptr1, block1);
} else {
// Wide block: every column of this row uses the same source (mi_index_1);
// items are written consecutively for plane 1, then plane 2.
const int type_index = Inter2x2ArrOffset + is_compound;
int dst_addr =
16 * (blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr));
index_addr += 4;
const int interp_filters = buffer_mi[mi_index_1].interp_filters;
uint filter_type_h =
(InterFilterLut >> ((((interp_filters >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
uint filter_type_v = (InterFilterLut >> (((interp_filters & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
const int block_type = (int)buffer_mi[mi_index_1].block_type;
uint mode_base = ((((block_type << 8) >> 24) - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
((((block_type >> 24) - 1) & 7) << 14) | (1 << 25) | no_skip_flag | bw_flag;
uint4 block;
block.z = buffer_mi[mi_index_1].mv[0];
block.w = buffer_mi[mi_index_1].mv[1];
for (int p = 1; p < 3; ++p) {
for (int col = 0; col < bcols; ++col) {
block.x = (x + col) | ((y + row) << 16);
block.y = mode_base | p;
pred_blocks.Store4(dst_addr, block);
dst_addr += 16;
}
}
}
}
} else //! sub8x8
{
// Regular chroma: mirrors the luma logic with chroma sizes and emits
// two consecutive items (plane 1, then plane 2).
const uint filter_type_h =
(InterFilterLut >> ((((mi.interp_filters >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
const uint filter_type_v = (InterFilterLut >> (((mi.interp_filters & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
if (is_compound) {
if (comp_type == COMPOUND_WEDGE) {
// Chroma wedge masks use the .y offset and the chroma mask size.
wtd = cb_wedge_offsets[bsize].y + (wedge_idx << max(0, bw_log_uv + bh_log_uv - 2));
wtd <<= 17;
}
// Chroma warp additionally needs the luma block to be at least 4x4 mode-info units.
const int is_warp_compound = (is_global_warp0 || is_global_warp1) && bw_log > 1 && bh_log > 1;
// Same table as luma except comp_type 3 maps to 3 instead of 2.
const int gpu_comp_type = (0x3100 >> (comp_type * 4)) & 15;
uint4 block;
block.x = mi_col_uv | (mi_row_uv << 16);
block.y = 1 | ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((ref1 - 1) << 14) | wtd |
no_skip_flag | (gpu_comp_type << 30);
block.z = mi.mv[0];
block.w = mi.mv[1];
// comp_type 0,1 -> pass 2; 2 -> pass 4; 3 -> pass 6;
// global warp -> pass 7 for diff-weighted, otherwise pass 5.
const int pass_type =
is_warp_compound ? (comp_type == COMPOUND_DIFFWTD ? 7 : 5) : ((0x6422 >> (comp_type * 4)) & 15);
int pass_type_index = (pass_type - 1) * InterSizesAllCommon + block_size_id_uv;
int dst_ptr =
blocks_index_base.Load(4 * (cb_index_offset + pass_type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 16;
pred_blocks.Store4(dst_ptr, block);
block.y ^= 3;
pred_blocks.Store4(dst_ptr + 16, block);
} else {
const int is_chroma_warp = is_luma_warp && bw_log >= 2 && bh_log >= 2;
if (is_chroma_warp) {
// Two consecutive 48-byte warp records (plane 1, plane 2) with the
// same layout as the luma warp record.
int dst_ptr =
blocks_index_base.Load(4 * (cb_index_offset_warp + block_size_id_uv)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 48;
WarpedMotionParams params;
if (is_local_warp)
params = mi.wm_params;
else
params = cb_wm_params[ref0 - 1].params;
int4 angle32;
angle32.x = ((int)(params.angles.x << 16)) >> 16;
angle32.y = ((int)params.angles.x) >> 16;
angle32.w = ((int)(params.angles.y << 16)) >> 16;
angle32.z = ((int)params.angles.y) >> 16;
pred_blocks_warp.Store(dst_ptr, mi_col_uv | (mi_row_uv << 16));
pred_blocks_warp.Store(dst_ptr + 4, 1 | ((ref0 - 1) << 2) | no_skip_flag);
pred_blocks_warp.Store4(dst_ptr + 8, params.mat[0]);
pred_blocks_warp.Store2(dst_ptr + 24, params.mat[1].xy);
pred_blocks_warp.Store4(dst_ptr + 32, angle32);
dst_ptr += 48;
pred_blocks_warp.Store(dst_ptr, mi_col_uv | (mi_row_uv << 16));
pred_blocks_warp.Store(dst_ptr + 4, 2 | ((ref0 - 1) << 2) | no_skip_flag);
pred_blocks_warp.Store4(dst_ptr + 8, params.mat[0]);
pred_blocks_warp.Store2(dst_ptr + 24, params.mat[1].xy);
pred_blocks_warp.Store4(dst_ptr + 32, angle32);
} else {
uint4 block;
block.x = mi_col_uv | (mi_row_uv << 16);
block.y = 1 | ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | no_skip_flag;
block.z = mi.mv[0];
block.w = 0;
int dst_ptr =
blocks_index_base.Load(4 * (cb_index_offset + block_size_id_uv)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 16;
pred_blocks.Store4(dst_ptr, block);
dst_ptr += 16;
block.y ^= 3;
pred_blocks.Store4(dst_ptr, block);
}
}
}
}
// ---- OBMC from the row above ----
if (is_obmc_above) {
// Visible width of the block in mode-info units.
const int x_mis = min(bw, cb_mi_cols - mi_col);
// log2 of the overlap height (half the block height, capped at 3).
int h = bh_log > 4 ? 3 : (bh_log - 1);
int huv = h == 0 ? 0 : h - 1;
// Chroma OBMC from above is skipped for the smallest block sizes.
int obmc_chroma = bsize > BLOCK_16X8 && bsize != BLOCK_4X16 && bsize != BLOCK_16X4;
int count = 0;
// Walk the neighbours above, limited to min(bw_log, 4) counted neighbours.
for (int col = 0; col < x_mis && count < min(bw_log, 4);) {
int mi_addr_above = get_mi_index(mi_grid, mi_col + col + (mi_row - 1) * cb_mi_stride, cb_mi_addr_base);
int w = mi_size_wide_log2[buffer_mi[mi_addr_above].block_type & 255];
// A one-unit-wide neighbour is handled as a pair: step two units and
// take the motion from the second block of the pair.
if (w == 0) {
w = 1;
mi_addr_above = get_mi_index(mi_grid, mi_col + col + 1 + (mi_row - 1) * cb_mi_stride, cb_mi_addr_base);
}
if (w > bw_log) w = bw_log;
uint above_ref = ((int)buffer_mi[mi_addr_above].block_type << 8) >> 24;
// Only neighbours with ref0 > 0 contribute.
if (above_ref > 0) {
// A neighbour with w == 5 counts twice towards the limit.
count += 1 + (w == 5);
const int filters = buffer_mi[mi_addr_above].interp_filters;
uint filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((w > 0) << 4))) & 7;
uint filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((h > 0) << 4))) & 7;
int4 block;
block.x = (mi_col + col) | (mi_row << 16);
// Bits 17.. carry (1 << h) for the luma item.
block.y = 0 | ((above_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((1 << h) << 17);
block.z = (buffer_mi[mi_addr_above].mv[0] << 1) & 0xfffeffff;
block.w = 0;
int type_index = (ObmcAbove - 1) * InterSizesAllCommon + ((w << 2) | h); // InterBlockSizeIndexLUT[w][h];
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
pred_blocks.Store4(dst_ptr * 16, block);
if (obmc_chroma) {
filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((w > 1) << 4))) & 7;
filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((huv > 0) << 4))) & 7;
block.x = ((mi_col + col) >> 1) | ((mi_row >> 1) << 16);
// Nibble table 0x84210 maps h = 0..4 to 0, 1, 2, 4, 8 for bits 17.. of the chroma item.
block.y = 1 | ((above_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
(((0x84210 >> (h * 4)) & 15) << 17);
block.z = buffer_mi[mi_addr_above].mv[0];
type_index = (ObmcAbove - 1) * InterSizesAllCommon + (((w - 1) << 2) | huv);
dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 16;
pred_blocks.Store4(dst_ptr, block);
block.y ^= 3;
dst_ptr += 16;
pred_blocks.Store4(dst_ptr, block);
}
}
col += 1 << w;
}
}
// ---- OBMC from the column to the left ----
// Same scheme as above with width / height swapped. Unlike the above case
// there is no obmc_chroma gate: chroma items are always emitted.
if (is_obmc_left) {
const int y_mis = min(bh, cb_mi_rows - mi_row);
int w = bw_log > 4 ? 3 : (bw_log - 1);
int wuv = w == 0 ? 0 : w - 1;
int count = 0;
for (int row = 0; row < y_mis && count < min(bh_log, 4);) {
int mi_addr_left = get_mi_index(mi_grid, mi_col - 1 + (mi_row + row) * cb_mi_stride, cb_mi_addr_base);
int h = mi_size_high_log2[buffer_mi[mi_addr_left].block_type & 255];
if (h == 0) {
h = 1;
// left += xd->mi_stride;
mi_addr_left = get_mi_index(mi_grid, mi_col - 1 + (mi_row + row + 1) * cb_mi_stride, cb_mi_addr_base);
}
if (h > bh_log) h = bh_log;
uint left_ref = ((int)buffer_mi[mi_addr_left].block_type << 8) >> 24;
if (left_ref > 0) {
count += 1 + (h == 5);
const int filters = buffer_mi[mi_addr_left].interp_filters;
uint filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((w > 0) << 4))) & 7;
uint filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((h > 0) << 4))) & 7;
int4 block;
block.x = mi_col | ((mi_row + row) << 16);
block.y = 0 | ((left_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((1 << w) << 17);
block.z = (buffer_mi[mi_addr_left].mv[0] << 1) & 0xfffeffff;
block.w = 0;
int type_index = (ObmcLeft - 1) * InterSizesAllCommon + ((h << 2) | w); // InterBlockSizeIndexLUT[w][h];
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
pred_blocks.Store4(dst_ptr * 16, block);
filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((wuv > 0) << 4))) & 7;
filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((h > 1) << 4))) & 7;
block.x = (mi_col >> 1) | (((mi_row + row) >> 1) << 16);
block.y = 1 | ((left_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
(((0x84210 >> (w * 4)) & 15) << 17);
block.z = buffer_mi[mi_addr_left].mv[0];
type_index = (ObmcLeft - 1) * InterSizesAllCommon + (((h - 1) << 2) | wuv);
dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 16;
pred_blocks.Store4(dst_ptr, block);
block.y ^= 3;
dst_ptr += 16;
pred_blocks.Store4(dst_ptr, block);
}
row += 1 << h;
}
}
}
// Palette sizes: low 16 bits for luma, high 16 bits for chroma.
const int y_use_palette = (mi.palette_mode_info.sizes & 0xffff) != 0;
const int uv_use_palette = (mi.palette_mode_info.sizes & 0xffff0000) != 0;
// ===========================================================================
// Intra prediction (intra blocks and the intra half of inter-intra blocks)
//
// Intra record flags (.y) as assembled below:
//   bits 0..2  log2 transform width     bits 3..4  plane (chroma only)
//   bit  5     (tx_info & 0xff00) == 0  bits 6..9  GPU mode code
//   bits 10..  above edge length        bits 16..  left edge length
//   bit  22/23 upsample above / left (CFL reuses bits 22.. for alpha + 16)
//   bits 24..25 above filter strength   bits 26..27 left filter strength
//   bits 28..30 angle delta + 3         bit  31    inter-intra
// ===========================================================================
if (ref0 <= 0 || is_inter_intra) {
// log2 transform width / height in mode-info units, indexed by transform size code.
const int tx_size_wide_log2[] = {0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4};
const int tx_size_high_log2[] = {0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2};
// Base angle in degrees per mode; only entries V_PRED..D67_PRED are read.
const int mode_to_angle_map[] = {
0, 90, 180, 45, 135, 113, 157, 203, 67,
};
const int disable_edge_filter = cb_disable_edge_filter;
const int intra_mode_flags = mi.intra_mode_flags;
const int is_intrabc = (intra_mode_flags & 0x100) != 0;
const int interintra_mode = (mi.modes >> 16) & 255;
uint tx_info = mi.tx_info;
uint tx_size = tx_info & 255;
int txw = tx_size_wide_log2[tx_size];
int txh = tx_size_high_log2[tx_size];
// Intra-BC and inter-intra allow chroma units one step larger (log2 4 instead of 3).
const int tx_uv_add = is_intrabc || is_inter_intra;
int txw_uv = min(bw_log_uv, 3 + tx_uv_add);
int txh_uv = min(bh_log_uv, 3 + tx_uv_add);
// Segments flagged in cb_lossless_seg use the smallest transform for both planes.
if (cb_lossless_seg[(tx_info >> 24) & 7].x && !tx_uv_add) {
txw = 0;
txh = 0;
txw_uv = 0;
txh_uv = 0;
}
// Intra-BC and inter-intra predict in block-sized units instead of transform-sized ones.
txw = is_intrabc ? min(bw_log, 4) : is_inter_intra ? bw_log : txw;
txh = is_intrabc ? min(bh_log, 4) : is_inter_intra ? bh_log : txh;
// Number of units that still fit inside the frame (rounded up).
const int max_cnt_x = (cb_mi_cols - mi_col + (1 << txw) - 1) >> txw;
const int max_cnt_y = (cb_mi_rows - mi_row + (1 << txh) - 1) >> txh;
// Blocks with log2 size 5 are processed as two halves per axis (except intra-BC).
const int unit_x_log = bw_log == 5 && !is_intrabc;
const int unit_y_log = bh_log == 5 && !is_intrabc;
int cnt_y = 1 << (bh_log - txh);
int cnt_x = 1 << (bw_log - txw);
// Right / bottom limit of the visible luma area, scaled by 4
// (presumably pixels - TODO confirm); stored for CFL items.
const int cfl_max_x = (mi_col + (min(cnt_x, max_cnt_x) << txw)) << 2;
const int cfl_max_y = (mi_row + (min(cnt_y, max_cnt_y) << txh)) << 2;
// ---- Luma ----
if (!y_use_palette) {
// Inter-intra mode remap via nibble table 0x9210: 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 9.
const int mode1 = (is_inter_intra ? (0x9210 >> (interintra_mode * 4)) : mi.modes) & 15;
int need_above = (NeedAboveLut >> mode1) & 1;
int need_left = (NeedLeftLut >> mode1) & 1;
const int use_filter = mi.filter_intra_mode_info >> 8;
// GPU mode code: 12 = intra-BC, 13 = filter intra, 14 = mode 0, otherwise mode - 1.
const int mode_gpu = is_intrabc ? (12 << 6) : use_filter ? (13 << 6) : mode1 ? ((mode1 - 1) << 6) : (14 << 6);
int is_dir = mode1 >= V_PRED && mode1 <= D67_PRED;
int mode_flags_base = txw | (((tx_info & 0xff00) == 0) << 5) | mode_gpu;
int dir_above_filter = 0;
int dir_left_filter = 0;
if (is_dir) {
int upsample_above = 0;
int upsample_left = 0;
// Luma angle delta: signed byte at bits 16..23 of intra_mode_flags, in steps of 3 degrees.
int angle_delta = (intra_mode_flags << 8) >> 24;
int angle = mode_to_angle_map[mode1] + angle_delta * 3;
const int mode_angle = angle_delta + 3;
if (!disable_edge_filter) {
// filt_type is 1 when the above or left neighbour uses a smooth mode.
int filt_type = 0;
if (mi_row > cb_row_srart) {
const int above_idx = get_mi_index(mi_grid, mi_col + (mi_row - 1) * cb_mi_stride, cb_mi_addr_base);
const int m = buffer_mi[above_idx].modes & 255;
filt_type = m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED;
}
if (mi_col > cb_col_srart) {
const int left_idx = get_mi_index(mi_grid, mi_col - 1 + mi_row * cb_mi_stride, cb_mi_addr_base);
const int m = buffer_mi[left_idx].modes & 255;
filt_type |= m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED;
}
int d90 = abs(angle - 90);
int d180 = abs(angle - 180);
// Transform width + height, scaled by 4 per mode-info unit.
int blk_wh = (4 << txw) + (4 << txh);
// Upsample only for near-vertical / near-horizontal angles on small blocks.
upsample_above = d90 != 0 && d90 < 40 && blk_wh <= (16 >> filt_type);
upsample_left = d180 != 0 && d180 < 40 && blk_wh <= (16 >> filt_type);
dir_above_filter = intra_edge_filter_strength(blk_wh, d90, filt_type);
dir_left_filter = intra_edge_filter_strength(blk_wh, d180, filt_type);
}
mode_flags_base |= (upsample_above << 22) | (upsample_left << 23) | (mode_angle << 28);
}
mode_flags_base |= is_inter_intra << 31;
// mode_info0 (.z of the record) carries mode-specific data.
int mode_info0 = 0;
if (is_inter_intra) {
// Mask offset: wedge masks when bit 0 of intra_mode_flags is set, otherwise
// the mask at index 32 + interintra_mode.
const int w_idx = mi.interintra_wedge_sign + mi.interintra_wedge_index * 2;
const int w_ofs = cb_wedge_offsets[bsize].x;
const int w_sz = 1 << max(0, bw_log + bh_log - 2);
mode_info0 = w_ofs + w_sz * ((intra_mode_flags & 1) ? w_idx : (32 + interintra_mode));
} else if (is_intrabc) {
// Intra-BC stores the block vector (same shift / mask as luma inter MVs)
// and needs no neighbour edges.
mode_info0 = (mi.mv[0] << 1) & 0xfffeffff;
need_left = 0;
need_above = 0;
} else if (use_filter) {
mode_info0 = txh | ((mi.filter_intra_mode_info & 255) << 4);
}
// List selection: one list per (txw + txh), or the dedicated filter-intra list.
const int type_size = txw + txh;
const int type_idx_base = use_filter ? IntraBlockOffset : (IntraBlockOffset - 1 - type_size);
cnt_x >>= unit_x_log;
cnt_y >>= unit_y_log;
// Iterate half-block units, then the transform blocks inside each unit,
// clipped to the frame by max_cnt_x / max_cnt_y.
for (int unit_y = 0; unit_y <= unit_y_log; ++unit_y) {
for (int unit_x = 0; unit_x <= unit_x_log; ++unit_x) {
const int x_start = unit_x * cnt_x;
const int x_end = min(x_start + cnt_x, max_cnt_x);
const int y_start = unit_y * cnt_y;
const int y_end = min(y_start + cnt_y, max_cnt_y);
for (int y = y_start; y < y_end; ++y) {
for (int x = x_start; x < x_end; ++x) {
const int col = mi_col + (x << txw);
const int row = mi_row + (y << txh);
const int subblk_w = 1 << txw;
const int subblk_h = 1 << txh;
const int have_top = y || (mi_row > cb_row_srart);
const int have_left = x || (mi_col > cb_col_srart);
// Index entry: bit 0 = top-right available, bit 1 = bottom-left
// available, bits 2.. = position inside the destination list.
uint block_index = blocks_indexes.Load(index_addr);
index_addr += 4;
// Edge lengths are in mode-info units; when the edge is not needed
// only the 0/1 availability flag is stored.
int above_available = have_top;
if (need_above) {
// xr: units remaining to the right of this transform block inside the frame.
const int xr = cb_mi_cols - col - subblk_w;
int have_top_right = block_index & 1;
above_available =
(have_top ? min(subblk_w, subblk_w + xr) : 0) + (have_top_right ? min(subblk_w, xr) : 0);
}
int left_available = have_left;
if (need_left) {
// yd: units remaining below this transform block inside the frame.
const int yd = cb_mi_rows - row - subblk_h;
int have_bottom_left = block_index & 2;
left_available =
(have_left ? min(subblk_h, subblk_h + yd) : 0) + (have_bottom_left ? min(subblk_h, yd) : 0);
}
// The iteration grid is read at the block's right column, one row down.
int iter_grid_stride = cb_iter_grid_stride;
int iter = intra_iter_grid.Load((col + subblk_w + (row + 1) * iter_grid_stride) * 4);
const int type_index = iter * IntraTypeCount + type_idx_base;
const int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + (block_index >> 2);
uint4 block;
block.x = col | (row << 16);
// Filter strengths are only stored for edges that are available.
block.y = mode_flags_base | (above_available << 10) | (left_available << 16) |
(above_available ? (dir_above_filter << 24) : 0) |
(left_available ? (dir_left_filter << 26) : 0);
block.z = mode_info0;
block.w = 0;
pred_blocks.Store4(dst_ptr * 16, block);
}
}
}
}
}
// ---- Chroma ----
if (!uv_use_palette && !is_chroma_ref) {
const int mi_col_uv = mi_col >> 1;
const int mi_row_uv = mi_row >> 1;
// Chroma mode: bits 8.. of modes (or the remapped inter-intra mode).
const int mode1 = (is_inter_intra ? (0x9210 >> (interintra_mode * 4)) : (mi.modes >> 8)) & 15;
int need_above = (NeedAboveLut >> mode1) & 1;
int need_left = (NeedLeftLut >> mode1) & 1;
// GPU mode code: 12 = intra-BC, 15 = CFL, 14 = mode 0, otherwise mode - 1.
const int mode_gpu =
is_intrabc ? (12 << 6) : mode1 == UV_CFL_PRED ? (15 << 6) : mode1 ? ((mode1 - 1) << 6) : (14 << 6);
int is_dir = mode1 >= V_PRED && mode1 <= D67_PRED;
int mode_flags_base = txw_uv | (((tx_info & 0xff00) == 0) << 5) | mode_gpu;
int dir_above_filter = 0;
int dir_left_filter = 0;
// Even-aligned luma position of the area covered by this chroma block.
const uint mi_col1 = mi_col & ~1;
const uint mi_row1 = mi_row & ~1;
if (is_dir) {
int upsample_above = 0;
int upsample_left = 0;
// Chroma angle delta: top byte of intra_mode_flags.
int angle_delta = intra_mode_flags >> 24;
int angle = mode_to_angle_map[mode1] + angle_delta * 3;
const int mode_angle = angle_delta + 3;
if (!disable_edge_filter) {
int filt_type = 0;
const int mi_base = mi_col1 + mi_row1 * cb_mi_stride;
// Chroma neighbours are sampled at (row - 1, col + 1) and (row + 1, col - 1)
// relative to the aligned base. A neighbour only counts as smooth when it
// has ref0 <= 0 and is not intra-BC.
if (mi_row1 > cb_row_srart) {
const int above_idx = get_mi_index(mi_grid, mi_base + 1 - cb_mi_stride, cb_mi_addr_base);
const int m = (buffer_mi[above_idx].modes >> 8) & 255;
filt_type = (m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED) &&
(((int)buffer_mi[above_idx].block_type << 8) >> 24) <= 0 &&
(buffer_mi[above_idx].intra_mode_flags & 0x100) == 0;
}
if (mi_col1 > cb_col_srart) {
const int left_idx = get_mi_index(mi_grid, mi_base - 1 + cb_mi_stride, cb_mi_addr_base);
const int m = (buffer_mi[left_idx].modes >> 8) & 255;
filt_type |= (m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED) &&
(((int)buffer_mi[left_idx].block_type << 8) >> 24) <= 0 &&
(buffer_mi[left_idx].intra_mode_flags & 0x100) == 0;
}
int d90 = abs(angle - 90);
int d180 = abs(angle - 180);
int blk_wh = (4 << txw_uv) + (4 << txh_uv);
upsample_above = d90 != 0 && d90 < 40 && blk_wh <= (16 >> filt_type);
upsample_left = d180 != 0 && d180 < 40 && blk_wh <= (16 >> filt_type);
dir_above_filter = intra_edge_filter_strength(blk_wh, d90, filt_type);
dir_left_filter = intra_edge_filter_strength(blk_wh, d180, filt_type);
}
mode_flags_base |= (upsample_above << 22) | (upsample_left << 23) | (mode_angle << 28);
}
mode_flags_base |= is_inter_intra << 31;
int mode_u = 1 << 3; // plane
int mode_v = 2 << 3; // plane
int mode_info0 = 0;
if (mode1 == UV_CFL_PRED) {
// Decode the joint sign into per-plane signs (0, 1 or 2), then turn the
// 4-bit magnitudes into signed alpha values: sign 2 -> +(mag + 1),
// sign 1 -> -(mag + 1), sign 0 -> 0. Stored biased by 16 at bits 22.. .
int sign_u = ((mi.cfl_alpha_signs + 1) * 11) >> 5; // CFL_SIGN_U(cfl_alpha_signs);
int sign_v = (mi.cfl_alpha_signs + 1) - 3 * sign_u; // CFL_SIGN_V(cfl_alpha_signs);
int idx_u = (sign_u == 2) ? (mi.cfl_alpha_idx >> 4) + 1 : (sign_u == 1) ? -(mi.cfl_alpha_idx >> 4) - 1 : 0;
int idx_v = (sign_v == 2) ? (mi.cfl_alpha_idx & 15) + 1 : (sign_v == 1) ? -(mi.cfl_alpha_idx & 15) - 1 : 0;
mode_u |= (idx_u + 16) << 22;
mode_v |= (idx_v + 16) << 22;
mode_info0 = cfl_max_x | (cfl_max_y << 16);
}
if (is_inter_intra) {
// Same mask addressing as luma, with the chroma offset (.y) and chroma mask size.
const int w_idx = mi.interintra_wedge_sign + mi.interintra_wedge_index * 2;
const int w_ofs = cb_wedge_offsets[bsize].y;
const int w_sz = 1 << max(0, bw_log_uv + bh_log_uv - 2);
mode_info0 = w_ofs + w_sz * ((intra_mode_flags & 1) ? w_idx : (32 + interintra_mode));
} else if (is_intrabc) {
// Chroma intra-BC stores the vector unmodified.
mode_info0 = mi.mv[0];
need_left = 0;
need_above = 0;
}
const int type_idx_base = IntraBlockOffset - 1 - txw_uv - txh_uv;
const int cnt_y_uv = 1 << (bh_log_uv - txh_uv - unit_y_log);
const int cnt_x_uv = 1 << (bw_log_uv - txw_uv - unit_x_log);
// Unlike the luma loops, these are not clipped by max_cnt_x / max_cnt_y.
for (int unit_y = 0; unit_y <= unit_y_log; ++unit_y) {
for (int unit_x = 0; unit_x <= unit_x_log; ++unit_x) {
for (int suby = 0; suby < cnt_y_uv; ++suby) {
for (int subx = 0; subx < cnt_x_uv; ++subx) {
const int x = subx + unit_x * cnt_x_uv;
const int y = suby + unit_y * cnt_y_uv;
const int col = mi_col_uv + (x << txw_uv);
const int row = mi_row_uv + (y << txh_uv);
const int subblk_w = 1 << txw_uv;
const int subblk_h = 1 << txh_uv;
const int have_top = y || (mi_row1 > cb_row_srart);
const int have_left = x || (mi_col1 > cb_col_srart);
// Same index entry encoding as for luma (bit 0, bit 1, bits 2..).
uint block_index = blocks_indexes.Load(index_addr);
index_addr += 4;
int above_available = have_top;
if (need_above) {
// Remaining space to the right, computed in luma units and halved.
const int xr = ((cb_mi_cols - mi_col - bw) + (2 << bw_log_uv) - ((x + 1) << (txw_uv + 1))) >> 1;
int have_top_right = block_index & 1;
above_available =
(have_top ? min(subblk_w, subblk_w + xr) : 0) + (have_top_right ? min(subblk_w, xr) : 0);
}
int left_available = have_left;
if (need_left) {
// Remaining space below, computed in luma units and halved.
const int yd = ((cb_mi_rows - mi_row - bh) + (2 << bh_log_uv) - ((y + 1) << (txh_uv + 1))) >> 1;
int have_bottom_left = block_index & 2;
left_available =
(have_left ? min(subblk_h, subblk_h + yd) : 0) + (have_bottom_left ? min(subblk_h, yd) : 0);
}
int iter_grid_offset = cb_iter_grid_offset_uv;
int iter_grid_stride = cb_iter_grid_stride_uv;
int iter = intra_iter_grid.Load((iter_grid_offset + col + subblk_w + (row + 1) * iter_grid_stride) * 4);
const int type_index = iter * IntraTypeCount + type_idx_base;
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + (block_index >> 2);
dst_ptr *= 16;
uint uv_mode = mode_flags_base | (above_available << 10) | (left_available << 16) |
(above_available ? (dir_above_filter << 24) : 0) |
(left_available ? (dir_left_filter << 26) : 0);
// Two consecutive records: plane 1 (mode_u) followed by plane 2 (mode_v).
uint4 block;
block.x = col | (row << 16);
block.y = uv_mode | mode_u;
block.z = mode_info0;
block.w = 0;
pred_blocks.Store4(dst_ptr, block);
dst_ptr += 16;
block.y = uv_mode | mode_v;
pred_blocks.Store4(dst_ptr, block);
}
}
}
}
}
}
// ===========================================================================
// Reconstruction items: emitted for palette blocks and for OBMC blocks whose
// (tx_info & 0xff00) == 0.
// Flags: plane in bits 0..1, do_recon at bit 2, palette at bit 3.
// ===========================================================================
const int do_recon = (mi.tx_info & 0xff00) == 0;
const int inter_recon = do_recon && (is_obmc_above || is_obmc_left);
if (y_use_palette || inter_recon) {
const int type_index = ReconBlockOffset + bw_log + 6 * bh_log;
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 16;
uint4 block;
block.x = mi_col | (mi_row << 16);
block.y = (do_recon << 2) | (y_use_palette << 3);
block.z = 0;
block.w = 0;
pred_blocks.Store4(dst_ptr, block);
}
if ((uv_use_palette || inter_recon) && !is_chroma_ref) {
const int type_index = ReconBlockOffset + bw_log_uv + 6 * bh_log_uv;
int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
index_addr += 4;
dst_ptr *= 16;
uint4 block;
block.x = (mi_col >> 1) | ((mi_row >> 1) << 16);
block.y = 1 | (do_recon << 2) | (uv_use_palette << 3);
block.z = 0;
block.w = 0;
pred_blocks.Store4(dst_ptr, block);
dst_ptr += 16;
block.y = 2 | (do_recon << 2) | (uv_use_palette << 3);
pred_blocks.Store4(dst_ptr, block);
}
}