libav1/dx/shaders/gen_pred_blocks.hlsl - av1-xbox-one - Git at Google

 /*
  * Copyright 2020 Google LLC
  *
  */

 /*
  * Copyright (c) 2020, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include "mode_info.h"

 // Inter-prediction pass types and the layout of the per-type index table.
 // In main() a pass type p addresses a group of InterSizesAllCommon entries of
 // blocks_index_base through (p - 1) * InterSizesAllCommon + <size index>.
 // Most of these names are not referenced directly in the visible part of the
 // file: main() derives the pass type from packed literals (0x3422 / 0x6422)
 // or uses the literals 5 and 7; only ObmcAbove, ObmcLeft, InterSizesAllCommon
 // and Inter2x2ArrOffset appear by name there.
 #define Warp 0
 #define CasualInter 1
 #define CompoundAvrg 2
 #define CompoundDiff 3
 #define CompoundMasked 4
 #define CompoundGlobalWarp 5
 #define CompoundDiffUv 6
 #define CompoundDiffUvGlobalWarp 7
 #define ObmcAbove 8
 #define ObmcLeft 9
 #define Inter2x2 10
 #define Inter2x2Comp 11
 #define Inter2x2CompP2 12
 // Number of table entries reserved per pass type.
 #define InterSizesAllCommon 24
 // Numerically (Inter2x2 - 1) * InterSizesAllCommon; main() adds 0, 1 or 2 to
 // it to pick one of the Inter2x2Count sub-8x8 chroma entries.
 #define Inter2x2ArrOffset 216
 #define Inter2x2Count 3
 // Numerically Inter2x2ArrOffset + Inter2x2Count.
 #define InterCountsAll 219

 // Compound blend type identifiers. Not referenced by name in the visible part
 // of this file; main() builds the 2-bit value it stores in bits 30..31 of
 // block.y from the packed literals 0x2100 (luma) and 0x3100 (chroma), whose
 // nibbles take the values 0..3 listed here.
 #define CompoundTypeAvrg 0
 #define CompoundTypeMasked 1
 #define CompoundTypeDiffY 2
 #define CompoundTypeDiffUv 3

 // Layout constants for the intra / reconstruction part of the index table.
 // Only IntraTypeCount and IntraBlockOffset are used in the visible part of
 // main(): type_index = iter * IntraTypeCount + type_idx_base, where
 // type_idx_base is IntraBlockOffset for filter-intra blocks and
 // IntraBlockOffset - 1 - (txw + txh) otherwise.
 // Numerically ReconBlockOffset == InterCountsAll and
 // IntraBlockOffset == ReconBlockOffset + ReconstructBlockSizes + IntraSizes.
 #define IntraSizes 9
 #define ReconstructBlockSizes 36
 #define IntraTypeCount 10
 #define IntraBlockOffset 264
 #define ReconBlockOffset 219

 // Prediction mode values. main() compares them against the low byte of
 // mi.modes (luma mode) and, for chroma, against bits 8..11 of mi.modes.
 // Values 0..8 (DC_PRED..D67_PRED) also index mode_to_angle_map in main().
 #define DC_PRED 0
 #define V_PRED 1
 #define H_PRED 2
 #define D45_PRED 3
 #define D135_PRED 4
 #define D113_PRED 5
 #define D157_PRED 6
 #define D203_PRED 7
 #define D67_PRED 8
 #define SMOOTH_PRED 9
 #define SMOOTH_V_PRED 10
 #define SMOOTH_H_PRED 11
 #define PAETH_PRED 12
 // UV_CFL_PRED and NEARESTMV deliberately share the value 13: in the visible
 // code UV_CFL_PRED is only tested against the chroma mode, while the inter
 // modes are tested against the luma mode byte.
 #define UV_CFL_PRED 13
 #define NEARESTMV 13
 #define NEARMV 14
 #define GLOBALMV 15
 #define NEWMV 16
 // Compound ref compound modes
 #define NEAREST_NEARESTMV 17
 #define NEAR_NEARMV 18
 #define NEAREST_NEWMV 19
 #define NEW_NEARESTMV 20
 #define NEAR_NEWMV 21
 #define NEW_NEARMV 22
 #define GLOBAL_GLOBALMV 23
 #define NEW_NEWMV 24
 #define MB_MODE_COUNT 25
 // Half-open range [START, END) of single-reference inter modes; main() tests
 // mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END.
 #define SINGLE_INTER_MODE_START NEARESTMV
 #define SINGLE_INTER_MODE_END NEAREST_NEARESTMV

 // Block size identifiers (names are WIDTHxHEIGHT). main() takes the value
 // from the low byte of mi.block_type and uses it to index its local
 // mi_size_wide_log2 / mi_size_high_log2 tables (22 entries each) and
 // cb_wedge_offsets.
 #define BLOCK_4X4 0
 #define BLOCK_4X8 1
 #define BLOCK_8X4 2
 #define BLOCK_8X8 3
 #define BLOCK_8X16 4
 #define BLOCK_16X8 5
 #define BLOCK_16X16 6
 #define BLOCK_16X32 7
 #define BLOCK_32X16 8
 #define BLOCK_32X32 9
 #define BLOCK_32X64 10
 #define BLOCK_64X32 11
 #define BLOCK_64X64 12
 #define BLOCK_64X128 13
 #define BLOCK_128X64 14
 #define BLOCK_128X128 15
 #define BLOCK_4X16 16
 #define BLOCK_16X4 17
 #define BLOCK_8X32 18
 #define BLOCK_32X8 19
 #define BLOCK_16X64 20
 #define BLOCK_64X16 21

 // Motion mode values; main() reads the motion mode from mi.modes >> 24 and
 // compares it with OBMC_CAUSAL and WARPED_CAUSAL. MOTION_MODES is the count.
 #define SIMPLE_TRANSLATION 0
 #define OBMC_CAUSAL 1
 #define WARPED_CAUSAL 2
 #define MOTION_MODES 3

 // Inter-inter compound types; main() compares mi.interinter_comp.type with
 // COMPOUND_DISTWTD, COMPOUND_WEDGE and COMPOUND_DIFFWTD, and also uses the
 // value as a nibble index (comp_type * 4) into packed literal tables.
 #define COMPOUND_AVERAGE 0
 #define COMPOUND_DISTWTD 1
 #define COMPOUND_WEDGE 2
 #define COMPOUND_DIFFWTD 3
 #define COMPOUND_TYPES 4
 #define MASKED_COMPOUND_TYPES 2

 // Bit 13 of the packed block.y word. main() sets it for an inter block when
 // (mi.tx_info & 0xff00) == 0 and the block is neither inter-intra nor OBMC.
 #define InterNoSkipFlag 0x2000
 // One-bit-per-mode lookup masks, read as (Lut >> mode) & 1 with the intra
 // prediction mode as bit index. Only NeedAboveLut and NeedLeftLut are used in
 // the visible part of this file.
 #define NeedAboveLut 0x3f7f
 #define NeedLeftLut 0x3Ef7
 #define NeedRightLut 0x010A
 #define NeedBotLut 0x0084
 #define NeedAboveLeftLut 0x11ff
 // Packed table of eight 4-bit entries (only the low 3 bits are read, via & 7).
 // main() shifts by (filter << 2) + ((dim_log2 > 0) << 4), so nibbles 0..3
 // apply when the block dimension's log2 is 0 and nibbles 4..7 otherwise.
 #define InterFilterLut 0x25432010

 // Input resources.
 // buffer_mi: MB_MODE_INFO records; main() reads element cb_mi_offset + thread.x
 //            and neighbour elements located through get_mi_index().
 StructuredBuffer<MB_MODE_INFO> buffer_mi : register(t0);
 // blocks_indexes: 32-bit entries read sequentially from byte address
 //                 (mi.index_base + cb_mi_idx_base) * 4 onwards.
 ByteAddressBuffer blocks_indexes : register(t1);
 // blocks_index_base: 32-bit entries indexed by cb_index_offset (or
 //                    cb_index_offset_warp) plus a type/size index; the loaded
 //                    value is added to an entry from blocks_indexes to form
 //                    the destination record index.
 ByteAddressBuffer blocks_index_base : register(t2);
 // mi_grid: read by get_mi_index() with an 8-byte stride per grid cell.
 ByteAddressBuffer mi_grid : register(t3);
 // intra_iter_grid: 32-bit entries read in the intra path of main().
 ByteAddressBuffer intra_iter_grid : register(t4);

 // Output resources.
 // pred_blocks: 16-byte records (written with Store4 at record index * 16).
 RWByteAddressBuffer pred_blocks : register(u0);
 // pred_blocks_warp: 48-byte records (record index * 48) for warped blocks.
 RWByteAddressBuffer pred_blocks_warp : register(u1);

 // Constants used by main(). Field order and sizes define the constant buffer
 // layout and presumably must match the host-side structure - do not reorder.
 cbuffer GenBlockData : register(b0) {
   // Column / row limits in the same units as mi_col / mi_row; used to clamp
   // block extents in main().
   uint cb_mi_cols;
   uint cb_mi_rows;
   // Row stride used when computing a cell index into mi_grid.
   uint cb_mi_stride;
   // Subtracted from the address loaded from mi_grid in get_mi_index().
   uint cb_mi_addr_base;
   // Row stride used when indexing intra_iter_grid.
   uint cb_iter_grid_stride;
   // Not referenced in the visible part of this file.
   uint cb_iter_grid_offset_uv;
   uint cb_iter_grid_stride_uv;
   // Non-zero skips the intra edge filter / upsample computation.
   uint cb_disable_edge_filter;
   // [sic] "integet"; when non-zero, warped prediction is not allowed.
   uint cb_force_integet_mv;
   // Not referenced in the visible part of this file.
   int3 cb_reserved;
   // Indexed by block size (22 == number of BLOCK_* values); main() reads .x
   // for luma and .y for chroma wedge offsets. The original note here was "??".
   int4 cb_wedge_offsets[22];
   // Indexed by (ref0 - 1) + (ref1 - 1) * 8; only .x is read.
   int4 cb_dist_wtd[8 * 8];
   // Indexed by (tx_info >> 24) & 7; non-zero .x forces transform log2 sizes to 0.
   int4 cb_lossless_seg[8];
   // Indexed by reference - 1; non-zero .x marks that reference as global-warp.
   int4 cb_global_warp[8];
   // Indexed by ref0 - 1; warp parameters used when the block does not use its
   // own mi.wm_params.
   struct {
     WarpedMotionParams params;
     int pad;
   } cb_wm_params[8];
 };

 // Per-dispatch constants used by main().
 cbuffer GenBlockSRT : register(b1) {
   // Number of work items; threads with thread.x >= cb_wi_count exit at once.
   uint cb_wi_count;
   // Added to thread.x to get the element index into buffer_mi.
   uint cb_mi_offset;
   // Added to mi.index_base to get the first entry in blocks_indexes.
   uint cb_mi_idx_base;
   // [sic] "srart"; first column / row of the processed region: main() treats
   // a left / above neighbour as present only when mi_col / mi_row exceeds it.
   uint cb_col_srart;
   uint cb_row_srart;
   // Base offsets into blocks_index_base for regular and warped blocks.
   uint cb_index_offset;
   uint cb_index_offset_warp;
 };

 // Returns the intra edge filter strength (0..3) for a directional block.
 //   blk_wh - size measure of the block; main() passes (4 << txw) + (4 << txh).
 //   d      - absolute angular distance; main() passes |angle - 90| for the
 //            above edge and |angle - 180| for the left edge.
 //   type   - 0 selects the first threshold table, any other value the second;
 //            main() passes 1 when a neighbouring block uses a SMOOTH mode.
 int intra_edge_filter_strength(int blk_wh, int d, int type) {
   if (type != 0) {
     // Second table: thresholds used for non-zero type.
     if (blk_wh <= 8) return d >= 64 ? 2 : (d >= 40 ? 1 : 0);
     if (blk_wh <= 16) return d >= 48 ? 2 : (d >= 20 ? 1 : 0);
     if (blk_wh <= 24) return d >= 4 ? 3 : 0;
     return d >= 1 ? 3 : 0;
   }

   // First table (type == 0).
   if (blk_wh <= 8) return d >= 56 ? 1 : 0;
   // Sizes 9..12 and 13..16 share the same threshold.
   if (blk_wh <= 16) return d >= 40 ? 1 : 0;
   if (blk_wh <= 24) return d >= 32 ? 3 : (d >= 16 ? 2 : (d >= 8 ? 1 : 0));
   if (blk_wh <= 32) return d >= 32 ? 3 : (d >= 4 ? 2 : (d >= 1 ? 1 : 0));
   return d >= 1 ? 3 : 0;
 }

 // Converts a grid cell into an element index.
 //   grid  - buffer holding one 8-byte entry per cell; the first 32-bit word
 //           of the entry is read as an address.
 //   index - cell index within the grid.
 //   base  - value subtracted from the loaded address before scaling.
 // Returns (address - base) / ModeInfoSize; callers use it to index buffer_mi.
 uint get_mi_index(ByteAddressBuffer grid, int index, uint base) {
   const uint relative_addr = grid.Load(index * 8) - base;
   return relative_addr / ModeInfoSize;
 }

 [numthreads(64, 1, 1)] void main(uint3 thread
                                  : SV_DispatchThreadID) {
   if (thread.x >= cb_wi_count) return;

   const int InterBlockSizeIndexLUT[6][6] = {
       // h:        4    8    16    32    64    128
       {0, 1, 2, -1, -1, -1},     // w = 4 (4)
       {3, 4, 5, 6, -1, -1},      // w = 8
       {7, 8, 9, 10, 11, -1},     // w = 16
       {-1, 12, 13, 14, 15, -1},  // w = 32
       {-1, -1, 16, 17, 18, 19},  // w = 64
       {-1, -1, -1, -1, 20, 21}   // w = 128
   };
   const int mi_size_wide_log2[] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4};
   const int mi_size_high_log2[] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2};

   const int mi_addr = cb_mi_offset + thread.x;
   MB_MODE_INFO mi = buffer_mi[mi_addr];

   int bsize = mi.block_type & 255;
   const int bw_log = mi_size_wide_log2[bsize];
   const int bh_log = mi_size_high_log2[bsize];
   const int bw = 1 << bw_log;
   const int bh = 1 << bh_log;
   const int bw_log_uv = max(0, bw_log - 1);
   const int bh_log_uv = max(0, bh_log - 1);

   const uint mi_row = mi.mi_row;
   const uint mi_col = mi.mi_col;
   const int is_chroma_ref = ((mi_row & 1) == 0 && (bh & 1) == 1) || ((mi_col & 1) == 0 && (bw & 1) == 1);

   const int ref0 = ((int)mi.block_type << 8) >> 24;
   const int ref1 = ((int)mi.block_type) >> 24;

   const int mode = mi.modes & 255;
   const int is_inter_intra = ref0 > 0 && ref1 == 0 && bsize >= BLOCK_8X8 && bsize <= BLOCK_32X32 &&
                              mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
   int index_addr = (mi.index_base + cb_mi_idx_base) * 4;
   const int motion_mode = mi.modes >> 24;
   const int is_obmc_left = ref0 > 0 && motion_mode == OBMC_CAUSAL && mi_col > cb_col_srart;
   const int is_obmc_above = ref0 > 0 && motion_mode == OBMC_CAUSAL && mi_row > cb_row_srart;

   if (ref0 > 0) {
     const int is_compound = ref1 > 0;
     const int allow_warp = cb_force_integet_mv == 0 && bw_log > 0 && bh_log > 0;
     const int allow_global_warp = allow_warp && (mode == GLOBALMV || mode == GLOBAL_GLOBALMV);
     const int is_global_warp0 = cb_global_warp[ref0 - 1].x && allow_global_warp;
     const int is_global_warp1 = (is_compound == 0 || allow_global_warp == 0) ? 0 : cb_global_warp[ref1 - 1].x;
     const int is_local_warp = motion_mode == WARPED_CAUSAL && (mi.wm_params.type & 0x10000) == 0;
     const int is_luma_warp = (is_local_warp || is_global_warp0) && allow_warp;

     const int no_skip_flag =
         ((mi.tx_info & 0xff00) == 0 && !is_inter_intra && !is_obmc_left && !is_obmc_above) ? InterNoSkipFlag : 0;
     const int block_size_id_y = InterBlockSizeIndexLUT[bw_log][bh_log];
     const int comp_type = mi.interinter_comp.type;
     uint wtd = 0;
     const int wedge_idx = mi.interinter_comp.wedge_sign + mi.interinter_comp.wedge_index * 2;

     if (is_compound) {
       wtd = 0x88;
       if (comp_type == COMPOUND_DISTWTD) {
         wtd = cb_dist_wtd[ref0 - 1 + (ref1 - 1) * 8].x;
       } else if (comp_type == COMPOUND_WEDGE) {
         wtd = cb_wedge_offsets[bsize].x + (wedge_idx << (bw_log + bh_log - 2));
       } else if (comp_type == COMPOUND_DIFFWTD) {
         wtd = mi.interinter_comp.mask_type;
       }
       wtd <<= 17;

       const int is_warp_compound = is_global_warp0 || is_global_warp1;
       const uint filter_type_h =
           (InterFilterLut >> ((((mi.interp_filters >> 16) & 15) << 2) + ((bw_log > 0) << 4))) & 7;
       const uint filter_type_v = (InterFilterLut >> (((mi.interp_filters & 15) << 2) + ((bh_log > 0) << 4))) & 7;
       const int gpu_comp_type = (0x2100 >> (comp_type * 4)) & 15;

       uint4 block;
       block.x = mi_col | (mi_row << 16);
       block.y = ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((ref1 - 1) << 14) | wtd |
                 no_skip_flag | (gpu_comp_type << 30);

       block.z = (mi.mv[0] << 1) & 0xfffeffff;
       block.w = (mi.mv[1] << 1) & 0xfffeffff;

       const int pass_type = is_warp_compound ? 5 : ((0x3422 >> (comp_type * 4)) & 15);
       int pass_type_index = (pass_type - 1) * InterSizesAllCommon + block_size_id_y;

       int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + pass_type_index)) + blocks_indexes.Load(index_addr);
       index_addr += 4;
       pred_blocks.Store4(dst_ptr * 16, block);
     } else {
       if (is_luma_warp) {
         int dst_ptr =
             blocks_index_base.Load(4 * (cb_index_offset_warp + block_size_id_y)) + blocks_indexes.Load(index_addr);
         index_addr += 4;
         dst_ptr *= 48;
         pred_blocks_warp.Store(dst_ptr, mi_col | (mi_row << 16));
         pred_blocks_warp.Store(dst_ptr + 4, ((ref0 - 1) << 2) | no_skip_flag);

         WarpedMotionParams params;
         if (is_local_warp)
           params = mi.wm_params;
         else
           params = cb_wm_params[ref0 - 1].params;

         pred_blocks_warp.Store4(dst_ptr + 8, params.mat[0]);
         pred_blocks_warp.Store2(dst_ptr + 24, params.mat[1].xy);
         int4 angle32;
         angle32.x = ((int)(params.angles.x << 16)) >> 16;
         angle32.y = ((int)params.angles.x) >> 16;
         angle32.w = ((int)(params.angles.y << 16)) >> 16;
         angle32.z = ((int)params.angles.y) >> 16;
         pred_blocks_warp.Store4(dst_ptr + 32, angle32);
       } else {
         uint4 block;
         block.x = mi_col | (mi_row << 16);
         const uint filter_type_h =
             (InterFilterLut >> ((((mi.interp_filters >> 16) & 15) << 2) + ((bw_log > 0) << 4))) & 7;
         const uint filter_type_v = (InterFilterLut >> (((mi.interp_filters & 15) << 2) + ((bh_log > 0) << 4))) & 7;
         block.y = ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | no_skip_flag;
         block.z = (mi.mv[0] << 1) & 0xfffeffff;
         block.w = 0;

         int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + block_size_id_y)) + blocks_indexes.Load(index_addr);
         index_addr += 4;
         pred_blocks.Store4(dst_ptr * 16, block);
       }
     }

     if (!is_chroma_ref) {
       const int block_size_id_uv = InterBlockSizeIndexLUT[bw_log_uv][bh_log_uv];
       const int mi_col_uv = mi_col >> 1;
       const int mi_row_uv = mi_row >> 1;
       int sub8x8 = bw_log == 0 || bh_log == 0;

       int mi_addr_above = mi_addr;
       int mi_addr_left = mi_addr;
       int mi_addr_aboveleft = mi_addr;
       if (sub8x8) {
         int dy = bh_log == 0 ? -1 : 0;
         int dx = bw_log == 0 ? -1 : 0;

         mi_addr_left = get_mi_index(mi_grid, mi_col + dx + mi_row * cb_mi_stride, cb_mi_addr_base);
         mi_addr_above = get_mi_index(mi_grid, mi_col + (mi_row + dy) * cb_mi_stride, cb_mi_addr_base);
         mi_addr_aboveleft = get_mi_index(mi_grid, mi_col + dx + (mi_row + dy) * cb_mi_stride, cb_mi_addr_base);

         sub8x8 &= (((int)buffer_mi[mi_addr_left].block_type << 8) >> 24) > 0;
         sub8x8 &= (((int)buffer_mi[mi_addr_above].block_type << 8) >> 24) > 0;
         sub8x8 &= (((int)buffer_mi[mi_addr_aboveleft].block_type << 8) >> 24) > 0;
       }

       if (sub8x8) {
         int x = mi_col & (~1);
         int y = mi_row & (~1);
         const int brows = bh_log == 2 ? 4 : 2;
         const int bcols = bw_log == 2 ? 4 : 2;
         const int bh_flag = bh_log != 0 ? ((brows - 1) << 28) : 0;  // for scale
         const int bw_flag = bw_log != 0 ? ((bcols - 1) << 26) : 0;
         for (int row = 0; row < brows; ++row) {
           int mi_index_1 = row == 0 ? mi_addr_above : mi_addr;
           if (bw_log == 0) {
             const int mi_index_0 = row == 0 ? mi_addr_aboveleft : mi_addr_left;

             const int block_type0 = (int)buffer_mi[mi_index_0].block_type;
             const int block_type1 = (int)buffer_mi[mi_index_1].block_type;

             const int is_compound0 = (block_type0 >> 24) > 0;
             const int is_compound1 = (block_type1 >> 24) > 0;

             const int diff_comp = is_compound0 != is_compound1;
             int type_index0 = Inter2x2ArrOffset + (is_compound0 << diff_comp);
             int type_index1 = Inter2x2ArrOffset + (is_compound1 << diff_comp);

             const int interp_filters0 = buffer_mi[mi_index_0].interp_filters;
             const int interp_filters1 = buffer_mi[mi_index_1].interp_filters;
             const int flags = 1 |                                     // U-plane
                               (is_compound0 == is_compound1) << 25 |  // combo write
                               no_skip_flag | bh_flag;

             int dst_ptr0 =
                 blocks_index_base.Load(4 * (cb_index_offset + type_index0)) + blocks_indexes.Load(index_addr);
             index_addr += 4;
             int dst_ptr1 = dst_ptr0 + 1;
             if (diff_comp) {
               dst_ptr1 = blocks_index_base.Load(4 * (cb_index_offset + type_index1)) + blocks_indexes.Load(index_addr);
               index_addr += 4;
             }

             dst_ptr0 *= 16;
             dst_ptr1 *= 16;

             uint filter_type_h =
                 (InterFilterLut >> ((((interp_filters0 >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
             uint filter_type_v = (InterFilterLut >> (((interp_filters0 & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
             uint4 block0;
             block0.x = x | ((y + row) << 16);
             block0.y = flags | ((((block_type0 << 8) >> 24) - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
                        ((((block_type0 >> 24) - 1) & 7) << 14);
             block0.z = buffer_mi[mi_index_0].mv[0];
             block0.w = buffer_mi[mi_index_0].mv[1];

             filter_type_h = (InterFilterLut >> ((((interp_filters1 >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
             filter_type_v = (InterFilterLut >> (((interp_filters1 & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
             uint4 block1;
             block1.x = block0.x + 1;
             block1.y = flags | ((((block_type1 << 8) >> 24) - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
                        ((((block_type1 >> 24) - 1) & 7) << 14);
             block1.z = buffer_mi[mi_index_1].mv[0];
             block1.w = buffer_mi[mi_index_1].mv[1];

             pred_blocks.Store4(dst_ptr0, block0);
             pred_blocks.Store4(dst_ptr1, block1);
             block0.y ^= 3;
             block1.y ^= 3;
             dst_ptr0 += 32 >> diff_comp;
             dst_ptr1 += 32 >> diff_comp;
             pred_blocks.Store4(dst_ptr0, block0);
             pred_blocks.Store4(dst_ptr1, block1);
           } else {
             const int type_index = Inter2x2ArrOffset + is_compound;
             int dst_addr =
                 16 * (blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr));
             index_addr += 4;

             const int interp_filters = buffer_mi[mi_index_1].interp_filters;
             uint filter_type_h =
                 (InterFilterLut >> ((((interp_filters >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
             uint filter_type_v = (InterFilterLut >> (((interp_filters & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
             const int block_type = (int)buffer_mi[mi_index_1].block_type;

             uint mode_base = ((((block_type << 8) >> 24) - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
                              ((((block_type >> 24) - 1) & 7) << 14) | (1 << 25) | no_skip_flag | bw_flag;

             uint4 block;
             block.z = buffer_mi[mi_index_1].mv[0];
             block.w = buffer_mi[mi_index_1].mv[1];

             for (int p = 1; p < 3; ++p) {
               for (int col = 0; col < bcols; ++col) {
                 block.x = (x + col) | ((y + row) << 16);
                 block.y = mode_base | p;
                 pred_blocks.Store4(dst_addr, block);
                 dst_addr += 16;
               }
             }
           }
         }
       } else  //! sub8x8
       {
         const uint filter_type_h =
             (InterFilterLut >> ((((mi.interp_filters >> 16) & 15) << 2) + ((bw_log_uv > 0) << 4))) & 7;
         const uint filter_type_v = (InterFilterLut >> (((mi.interp_filters & 15) << 2) + ((bh_log_uv > 0) << 4))) & 7;
         if (is_compound) {
           if (comp_type == COMPOUND_WEDGE) {
             wtd = cb_wedge_offsets[bsize].y + (wedge_idx << max(0, bw_log_uv + bh_log_uv - 2));
             wtd <<= 17;
           }

           const int is_warp_compound = (is_global_warp0 || is_global_warp1) && bw_log > 1 && bh_log > 1;
           const int gpu_comp_type = (0x3100 >> (comp_type * 4)) & 15;

           uint4 block;
           block.x = mi_col_uv | (mi_row_uv << 16);
           block.y = 1 | ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((ref1 - 1) << 14) | wtd |
                     no_skip_flag | (gpu_comp_type << 30);
           block.z = mi.mv[0];
           block.w = mi.mv[1];
           const int pass_type =
               is_warp_compound ? (comp_type == COMPOUND_DIFFWTD ? 7 : 5) : ((0x6422 >> (comp_type * 4)) & 15);
           int pass_type_index = (pass_type - 1) * InterSizesAllCommon + block_size_id_uv;
           int dst_ptr =
               blocks_index_base.Load(4 * (cb_index_offset + pass_type_index)) + blocks_indexes.Load(index_addr);
           index_addr += 4;
           dst_ptr *= 16;
           pred_blocks.Store4(dst_ptr, block);
           block.y ^= 3;
           pred_blocks.Store4(dst_ptr + 16, block);
         } else {
           const int is_chroma_warp = is_luma_warp && bw_log >= 2 && bh_log >= 2;
           if (is_chroma_warp) {
             int dst_ptr =
                 blocks_index_base.Load(4 * (cb_index_offset_warp + block_size_id_uv)) + blocks_indexes.Load(index_addr);
             index_addr += 4;
             dst_ptr *= 48;

             WarpedMotionParams params;
             if (is_local_warp)
               params = mi.wm_params;
             else
               params = cb_wm_params[ref0 - 1].params;

             int4 angle32;
             angle32.x = ((int)(params.angles.x << 16)) >> 16;
             angle32.y = ((int)params.angles.x) >> 16;
             angle32.w = ((int)(params.angles.y << 16)) >> 16;
             angle32.z = ((int)params.angles.y) >> 16;

             pred_blocks_warp.Store(dst_ptr, mi_col_uv | (mi_row_uv << 16));
             pred_blocks_warp.Store(dst_ptr + 4, 1 | ((ref0 - 1) << 2) | no_skip_flag);
             pred_blocks_warp.Store4(dst_ptr + 8, params.mat[0]);
             pred_blocks_warp.Store2(dst_ptr + 24, params.mat[1].xy);
             pred_blocks_warp.Store4(dst_ptr + 32, angle32);
             dst_ptr += 48;
             pred_blocks_warp.Store(dst_ptr, mi_col_uv | (mi_row_uv << 16));
             pred_blocks_warp.Store(dst_ptr + 4, 2 | ((ref0 - 1) << 2) | no_skip_flag);
             pred_blocks_warp.Store4(dst_ptr + 8, params.mat[0]);
             pred_blocks_warp.Store2(dst_ptr + 24, params.mat[1].xy);
             pred_blocks_warp.Store4(dst_ptr + 32, angle32);

           } else {
             uint4 block;
             block.x = mi_col_uv | (mi_row_uv << 16);
             block.y = 1 | ((ref0 - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | no_skip_flag;
             block.z = mi.mv[0];
             block.w = 0;

             int dst_ptr =
                 blocks_index_base.Load(4 * (cb_index_offset + block_size_id_uv)) + blocks_indexes.Load(index_addr);
             index_addr += 4;
             dst_ptr *= 16;
             pred_blocks.Store4(dst_ptr, block);
             dst_ptr += 16;
             block.y ^= 3;
             pred_blocks.Store4(dst_ptr, block);
           }
         }
       }
     }

     if (is_obmc_above) {
       const int x_mis = min(bw, cb_mi_cols - mi_col);
       int h = bh_log > 4 ? 3 : (bh_log - 1);
       int huv = h == 0 ? 0 : h - 1;
       int obmc_chroma = bsize > BLOCK_16X8 && bsize != BLOCK_4X16 && bsize != BLOCK_16X4;
       int count = 0;
       for (int col = 0; col < x_mis && count < min(bw_log, 4);) {
         int mi_addr_above = get_mi_index(mi_grid, mi_col + col + (mi_row - 1) * cb_mi_stride, cb_mi_addr_base);
         int w = mi_size_wide_log2[buffer_mi[mi_addr_above].block_type & 255];
         if (w == 0) {
           w = 1;
           mi_addr_above = get_mi_index(mi_grid, mi_col + col + 1 + (mi_row - 1) * cb_mi_stride, cb_mi_addr_base);
         }
         if (w > bw_log) w = bw_log;

         uint above_ref = ((int)buffer_mi[mi_addr_above].block_type << 8) >> 24;
         if (above_ref > 0) {
           count += 1 + (w == 5);
           const int filters = buffer_mi[mi_addr_above].interp_filters;

           uint filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((w > 0) << 4))) & 7;
           uint filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((h > 0) << 4))) & 7;

           int4 block;
           block.x = (mi_col + col) | (mi_row << 16);
           block.y = 0 | ((above_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((1 << h) << 17);
           block.z = (buffer_mi[mi_addr_above].mv[0] << 1) & 0xfffeffff;
           block.w = 0;

           int type_index = (ObmcAbove - 1) * InterSizesAllCommon + ((w << 2) | h);  // InterBlockSizeIndexLUT[w][h];
           int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
           index_addr += 4;
           pred_blocks.Store4(dst_ptr * 16, block);

           if (obmc_chroma) {
             filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((w > 1) << 4))) & 7;
             filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((huv > 0) << 4))) & 7;
             block.x = ((mi_col + col) >> 1) | ((mi_row >> 1) << 16);
             block.y = 1 | ((above_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
                       (((0x84210 >> (h * 4)) & 15) << 17);
             block.z = buffer_mi[mi_addr_above].mv[0];

             type_index = (ObmcAbove - 1) * InterSizesAllCommon + (((w - 1) << 2) | huv);
             dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
             index_addr += 4;
             dst_ptr *= 16;
             pred_blocks.Store4(dst_ptr, block);
             block.y ^= 3;
             dst_ptr += 16;
             pred_blocks.Store4(dst_ptr, block);
           }
         }
         col += 1 << w;
       }
     }

     if (is_obmc_left) {
       const int y_mis = min(bh, cb_mi_rows - mi_row);
       int w = bw_log > 4 ? 3 : (bw_log - 1);
       int wuv = w == 0 ? 0 : w - 1;
       int count = 0;
       for (int row = 0; row < y_mis && count < min(bh_log, 4);) {
         int mi_addr_left = get_mi_index(mi_grid, mi_col - 1 + (mi_row + row) * cb_mi_stride, cb_mi_addr_base);
         int h = mi_size_high_log2[buffer_mi[mi_addr_left].block_type & 255];
         if (h == 0) {
           h = 1;
           // left += xd->mi_stride;
           mi_addr_left = get_mi_index(mi_grid, mi_col - 1 + (mi_row + row + 1) * cb_mi_stride, cb_mi_addr_base);
         }
         if (h > bh_log) h = bh_log;

         uint left_ref = ((int)buffer_mi[mi_addr_left].block_type << 8) >> 24;
         if (left_ref > 0) {
           count += 1 + (h == 5);

           const int filters = buffer_mi[mi_addr_left].interp_filters;
           uint filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((w > 0) << 4))) & 7;
           uint filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((h > 0) << 4))) & 7;

           int4 block;
           block.x = mi_col | ((mi_row + row) << 16);
           block.y = 0 | ((left_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) | ((1 << w) << 17);
           block.z = (buffer_mi[mi_addr_left].mv[0] << 1) & 0xfffeffff;
           block.w = 0;

           int type_index = (ObmcLeft - 1) * InterSizesAllCommon + ((h << 2) | w);  // InterBlockSizeIndexLUT[w][h];
           int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
           index_addr += 4;
           pred_blocks.Store4(dst_ptr * 16, block);
           filter_type_h = (InterFilterLut >> ((((filters >> 16) & 15) << 2) + ((wuv > 0) << 4))) & 7;
           filter_type_v = (InterFilterLut >> (((filters & 15) << 2) + ((h > 1) << 4))) & 7;
           block.x = (mi_col >> 1) | (((mi_row + row) >> 1) << 16);
           block.y = 1 | ((left_ref - 1) << 2) | (filter_type_h << 5) | (filter_type_v << 9) |
                     (((0x84210 >> (w * 4)) & 15) << 17);
           block.z = buffer_mi[mi_addr_left].mv[0];

           type_index = (ObmcLeft - 1) * InterSizesAllCommon + (((h - 1) << 2) | wuv);
           dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);
           index_addr += 4;
           dst_ptr *= 16;
           pred_blocks.Store4(dst_ptr, block);
           block.y ^= 3;
           dst_ptr += 16;
           pred_blocks.Store4(dst_ptr, block);
         }
         row += 1 << h;
       }
     }
   }

   const int y_use_palette = (mi.palette_mode_info.sizes & 0xffff) != 0;
   const int uv_use_palette = (mi.palette_mode_info.sizes & 0xffff0000) != 0;

   if (ref0 <= 0 || is_inter_intra) {
     const int tx_size_wide_log2[] = {0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4};
     const int tx_size_high_log2[] = {0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2};
     const int mode_to_angle_map[] = {
         0, 90, 180, 45, 135, 113, 157, 203, 67,
     };

     const int disable_edge_filter = cb_disable_edge_filter;
     const int intra_mode_flags = mi.intra_mode_flags;
     const int is_intrabc = (intra_mode_flags & 0x100) != 0;
     const int interintra_mode = (mi.modes >> 16) & 255;

     uint tx_info = mi.tx_info;
     uint tx_size = tx_info & 255;

     int txw = tx_size_wide_log2[tx_size];
     int txh = tx_size_high_log2[tx_size];
     const int tx_uv_add = is_intrabc || is_inter_intra;
     int txw_uv = min(bw_log_uv, 3 + tx_uv_add);
     int txh_uv = min(bh_log_uv, 3 + tx_uv_add);
     if (cb_lossless_seg[(tx_info >> 24) & 7].x && !tx_uv_add) {
       txw = 0;
       txh = 0;
       txw_uv = 0;
       txh_uv = 0;
     }

     txw = is_intrabc ? min(bw_log, 4) : is_inter_intra ? bw_log : txw;
     txh = is_intrabc ? min(bh_log, 4) : is_inter_intra ? bh_log : txh;

     const int max_cnt_x = (cb_mi_cols - mi_col + (1 << txw) - 1) >> txw;
     const int max_cnt_y = (cb_mi_rows - mi_row + (1 << txh) - 1) >> txh;
     const int unit_x_log = bw_log == 5 && !is_intrabc;
     const int unit_y_log = bh_log == 5 && !is_intrabc;

     int cnt_y = 1 << (bh_log - txh);
     int cnt_x = 1 << (bw_log - txw);
     const int cfl_max_x = (mi_col + (min(cnt_x, max_cnt_x) << txw)) << 2;
     const int cfl_max_y = (mi_row + (min(cnt_y, max_cnt_y) << txh)) << 2;

     if (!y_use_palette) {
       const int mode1 = (is_inter_intra ? (0x9210 >> (interintra_mode * 4)) : mi.modes) & 15;
       int need_above = (NeedAboveLut >> mode1) & 1;
       int need_left = (NeedLeftLut >> mode1) & 1;

       const int use_filter = mi.filter_intra_mode_info >> 8;

       const int mode_gpu = is_intrabc ? (12 << 6) : use_filter ? (13 << 6) : mode1 ? ((mode1 - 1) << 6) : (14 << 6);

       int is_dir = mode1 >= V_PRED && mode1 <= D67_PRED;

       int mode_flags_base = txw | (((tx_info & 0xff00) == 0) << 5) | mode_gpu;

       int dir_above_filter = 0;
       int dir_left_filter = 0;

       if (is_dir) {
         int upsample_above = 0;
         int upsample_left = 0;
         int angle_delta = (intra_mode_flags << 8) >> 24;
         int angle = mode_to_angle_map[mode1] + angle_delta * 3;
         const int mode_angle = angle_delta + 3;
         if (!disable_edge_filter) {
           int filt_type = 0;
           if (mi_row > cb_row_srart) {
             const int above_idx = get_mi_index(mi_grid, mi_col + (mi_row - 1) * cb_mi_stride, cb_mi_addr_base);
             const int m = buffer_mi[above_idx].modes & 255;
             filt_type = m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED;
           }
           if (mi_col > cb_col_srart) {
             const int left_idx = get_mi_index(mi_grid, mi_col - 1 + mi_row * cb_mi_stride, cb_mi_addr_base);
             const int m = buffer_mi[left_idx].modes & 255;
             filt_type |= m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED;
           }
           int d90 = abs(angle - 90);
           int d180 = abs(angle - 180);
           int blk_wh = (4 << txw) + (4 << txh);
           upsample_above = d90 != 0 && d90 < 40 && blk_wh <= (16 >> filt_type);
           upsample_left = d180 != 0 && d180 < 40 && blk_wh <= (16 >> filt_type);
           dir_above_filter = intra_edge_filter_strength(blk_wh, d90, filt_type);
           dir_left_filter = intra_edge_filter_strength(blk_wh, d180, filt_type);
         }
         mode_flags_base |= (upsample_above << 22) | (upsample_left << 23) | (mode_angle << 28);
       }

       // Bit 31 of the packed flags word marks an inter-intra block.
       mode_flags_base |= is_inter_intra << 31;

       // mode_info0 is the mode-specific payload written to block.z below.
       int mode_info0 = 0;
       if (is_inter_intra) {
         // Base offset for this block size plus w_sz per entry. The entry index is
         // sign + 2 * wedge_index when bit 0 of intra_mode_flags is set, otherwise
         // 32 + interintra_mode.
         // NOTE(review): presumably selects a wedge vs. smooth inter-intra mask - confirm.
         const int w_idx = mi.interintra_wedge_sign + mi.interintra_wedge_index * 2;
         const int w_ofs = cb_wedge_offsets[bsize].x;
         const int w_sz = 1 << max(0, bw_log + bh_log - 2);
         mode_info0 = w_ofs + w_sz * ((intra_mode_flags & 1) ? w_idx : (32 + interintra_mode));
       } else if (is_intrabc) {
         // mv[0] is doubled and bit 16 is cleared afterwards.
         // NOTE(review): presumably mv packs two 16-bit components and the mask keeps the
         // low half from spilling into the high half - confirm against the mv packing.
         mode_info0 = (mi.mv[0] << 1) & 0xfffeffff;
         // With both flags cleared, the availability fields below keep the plain
         // have_top / have_left values instead of edge lengths.
         need_left = 0;
         need_above = 0;
       } else if (use_filter) {
         // txh in the low bits, low byte of filter_intra_mode_info from bit 4 up.
         mode_info0 = txh | ((mi.filter_intra_mode_info & 255) << 4);
       }

       // Block type within one iteration: filter-intra blocks use IntraBlockOffset itself,
       // all others are indexed downwards from IntraBlockOffset - 1 by txw + txh.
       const int type_size = txw + txh;
       const int type_idx_base = use_filter ? IntraBlockOffset : (IntraBlockOffset - 1 - type_size);
       // Per-unit sub-block counts.
       cnt_x >>= unit_x_log;
       cnt_y >>= unit_y_log;

       // NOTE(review): the unit loops run unit_*_log + 1 times while the counts are divided
       // by 1 << unit_*_log; the two agree only for values 0 and 1 - confirm the range.
       for (int unit_y = 0; unit_y <= unit_y_log; ++unit_y) {
         for (int unit_x = 0; unit_x <= unit_x_log; ++unit_x) {
           // Sub-block range of this unit, clamped to max_cnt_x / max_cnt_y.
           const int x_start = unit_x * cnt_x;
           const int x_end = min(x_start + cnt_x, max_cnt_x);
           const int y_start = unit_y * cnt_y;
           const int y_end = min(y_start + cnt_y, max_cnt_y);
           for (int y = y_start; y < y_end; ++y) {
             for (int x = x_start; x < x_end; ++x) {
               // Position and size of this sub-block (same units as mi_col / mi_row).
               const int col = mi_col + (x << txw);
               const int row = mi_row + (y << txh);
               const int subblk_w = 1 << txw;
               const int subblk_h = 1 << txh;
               // An interior sub-block always has a neighbour; the first row/column has one
               // only when the block does not start at cb_row_srart / cb_col_srart.
               const int have_top = y || (mi_row > cb_row_srart);
               const int have_left = x || (mi_col > cb_col_srart);
               // One 4-byte index entry is consumed per sub-block: bit 0 = top-right flag,
               // bit 1 = bottom-left flag, remaining bits = slot offset inside the type bucket.
               uint block_index = blocks_indexes.Load(index_addr);
               index_addr += 4;

               // When the mode needs the above edge, above_available becomes a length:
               // the top part (clamped when the sub-block reaches past cb_mi_cols, i.e. xr < 0)
               // plus the top-right part limited by xr.
               int above_available = have_top;
               if (need_above) {
                 const int xr = cb_mi_cols - col - subblk_w;
                 int have_top_right = block_index & 1;
                 above_available =
                     (have_top ? min(subblk_w, subblk_w + xr) : 0) + (have_top_right ? min(subblk_w, xr) : 0);
               }

               // Same scheme for the left edge, using cb_mi_rows and the bottom-left flag.
               int left_available = have_left;
               if (need_left) {
                 const int yd = cb_mi_rows - row - subblk_h;
                 int have_bottom_left = block_index & 2;
                 left_available =
                     (have_left ? min(subblk_h, subblk_h + yd) : 0) + (have_bottom_left ? min(subblk_h, yd) : 0);
               }

               // The iteration number is read from the grid cell at (col + subblk_w, row + 1)
               // and selects which group of IntraTypeCount block types the record goes to.
               int iter_grid_stride = cb_iter_grid_stride;
               int iter = intra_iter_grid.Load((col + subblk_w + (row + 1) * iter_grid_stride) * 4);
               const int type_index = iter * IntraTypeCount + type_idx_base;
               // Destination slot = start of the type bucket + per-block offset.
               const int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + (block_index >> 2);

               // Record layout: x = col | row << 16; y = flags with above length at bit 10,
               // left length at bit 16 and the edge filter strengths at bits 24 / 26 (only
               // when the corresponding edge is available); z = mode payload; w unused.
               uint4 block;
               block.x = col | (row << 16);
               block.y = mode_flags_base | (above_available << 10) | (left_available << 16) |
                         (above_available ? (dir_above_filter << 24) : 0) |
                         (left_available ? (dir_left_filter << 26) : 0);
               block.z = mode_info0;
               block.w = 0;
               // Each record is 16 bytes.
               pred_blocks.Store4(dst_ptr * 16, block);
             }
           }
         }
       }
     }

     // Chroma intra records: one U and one V record per chroma transform sub-block.
     // NOTE(review): the guard reads !is_chroma_ref - confirm the flag's polarity at its definition.
     if (!uv_use_palette && !is_chroma_ref) {
       const int mi_col_uv = mi_col >> 1;
       const int mi_row_uv = mi_row >> 1;
       // Inter-intra: nibble lookup in 0x9210 maps interintra_mode 0..3 to 0, 1, 2, 9
       // (DC_PRED, V_PRED, H_PRED, SMOOTH_PRED). Otherwise the mode comes from bits 8..11 of mi.modes.
       const int mode1 = (is_inter_intra ? (0x9210 >> (interintra_mode * 4)) : (mi.modes >> 8)) & 15;

       // Per-mode bit lookups: does this mode read the above / left edge?
       int need_above = (NeedAboveLut >> mode1) & 1;
       int need_left = (NeedLeftLut >> mode1) & 1;

       // Mode code stored from bit 6: 12 for intrabc, 15 for CFL, 14 for mode 0 (DC_PRED),
       // mode1 - 1 for every other mode.
       const int mode_gpu =
           is_intrabc ? (12 << 6) : mode1 == UV_CFL_PRED ? (15 << 6) : mode1 ? ((mode1 - 1) << 6) : (14 << 6);

       // Directional modes are V_PRED..D67_PRED.
       int is_dir = mode1 >= V_PRED && mode1 <= D67_PRED;

       // Flags shared by every sub-block: txw_uv in the low bits, bit 5 set when
       // (tx_info & 0xff00) == 0, mode code from bit 6.
       int mode_flags_base = txw_uv | (((tx_info & 0xff00) == 0) << 5) | mode_gpu;

       int dir_above_filter = 0;
       int dir_left_filter = 0;

       // Block position with bit 0 cleared (even-aligned).
       const uint mi_col1 = mi_col & ~1;
       const uint mi_row1 = mi_row & ~1;
       if (is_dir) {
         int upsample_above = 0;
         int upsample_left = 0;
         // Prediction angle = base angle of the mode + 3 per delta step; the delta is
         // stored biased by 3 at bit 28.
         int angle_delta = intra_mode_flags >> 24;
         int angle = mode_to_angle_map[mode1] + angle_delta * 3;
         const int mode_angle = angle_delta + 3;
         if (!disable_edge_filter) {
           // filt_type becomes 1 when the above or left neighbour uses a smooth mode (bits 8..15
           // of modes), its sign-extended block_type bits 16..23 are <= 0 and bit 8 of its
           // intra_mode_flags is clear.
           // NOTE(review): presumably "reference is intra" and "not intrabc" - confirm field meanings.
           int filt_type = 0;
           const int mi_base = mi_col1 + mi_row1 * cb_mi_stride;
           if (mi_row1 > cb_row_srart) {
             // Above neighbour: one row up, one column right of the aligned base.
             const int above_idx = get_mi_index(mi_grid, mi_base + 1 - cb_mi_stride, cb_mi_addr_base);
             const int m = (buffer_mi[above_idx].modes >> 8) & 255;
             filt_type = (m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED) &&
                         (((int)buffer_mi[above_idx].block_type << 8) >> 24) <= 0 &&
                         (buffer_mi[above_idx].intra_mode_flags & 0x100) == 0;
           }
           if (mi_col1 > cb_col_srart) {
             // Left neighbour: one column left, one row down of the aligned base.
             const int left_idx = get_mi_index(mi_grid, mi_base - 1 + cb_mi_stride, cb_mi_addr_base);
             const int m = (buffer_mi[left_idx].modes >> 8) & 255;
             filt_type |= (m == SMOOTH_PRED || m == SMOOTH_V_PRED || m == SMOOTH_H_PRED) &&
                          (((int)buffer_mi[left_idx].block_type << 8) >> 24) <= 0 &&
                          (buffer_mi[left_idx].intra_mode_flags & 0x100) == 0;
           }
           // Angular distance from vertical (90) and horizontal (180).
           int d90 = abs(angle - 90);
           int d180 = abs(angle - 180);
           int blk_wh = (4 << txw_uv) + (4 << txh_uv);
           // Upsample an edge when the angle is within 40 of (but not equal to) that edge's
           // axis and the block is small enough; the size limit halves when filt_type is set.
           upsample_above = d90 != 0 && d90 < 40 && blk_wh <= (16 >> filt_type);
           upsample_left = d180 != 0 && d180 < 40 && blk_wh <= (16 >> filt_type);
           dir_above_filter = intra_edge_filter_strength(blk_wh, d90, filt_type);
           dir_left_filter = intra_edge_filter_strength(blk_wh, d180, filt_type);
         }
         mode_flags_base |= (upsample_above << 22) | (upsample_left << 23) | (mode_angle << 28);
       }

       // Bit 31 marks an inter-intra block.
       mode_flags_base |= is_inter_intra << 31;

       // Per-plane flag words; CFL additionally stores the plane's alpha below.
       int mode_u = 1 << 3;  // plane
       int mode_v = 2 << 3;  // plane
       // Mode-specific payload written to block.z.
       int mode_info0 = 0;
       if (mode1 == UV_CFL_PRED) {
         int sign_u = ((mi.cfl_alpha_signs + 1) * 11) >> 5;   // CFL_SIGN_U(cfl_alpha_signs);
         int sign_v = (mi.cfl_alpha_signs + 1) - 3 * sign_u;  // CFL_SIGN_V(cfl_alpha_signs);
         // Signed alpha: magnitude is nibble + 1 (high nibble for U, low nibble for V),
         // positive for sign 2, negative for sign 1, zero otherwise.
         int idx_u = (sign_u == 2) ? (mi.cfl_alpha_idx >> 4) + 1 : (sign_u == 1) ? -(mi.cfl_alpha_idx >> 4) - 1 : 0;
         int idx_v = (sign_v == 2) ? (mi.cfl_alpha_idx & 15) + 1 : (sign_v == 1) ? -(mi.cfl_alpha_idx & 15) - 1 : 0;
         // Stored biased by 16 starting at bit 22.
         mode_u |= (idx_u + 16) << 22;
         mode_v |= (idx_v + 16) << 22;
         mode_info0 = cfl_max_x | (cfl_max_y << 16);
       }

       // Inter-intra and intrabc overwrite the payload set above.
       if (is_inter_intra) {
         // Same offset formula as used with cb_wedge_offsets[bsize].x, but with the .y
         // component and the chroma block dimensions.
         const int w_idx = mi.interintra_wedge_sign + mi.interintra_wedge_index * 2;
         const int w_ofs = cb_wedge_offsets[bsize].y;
         const int w_sz = 1 << max(0, bw_log_uv + bh_log_uv - 2);
         mode_info0 = w_ofs + w_sz * ((intra_mode_flags & 1) ? w_idx : (32 + interintra_mode));
       } else if (is_intrabc) {
         // mv[0] is passed through unchanged here; edge lengths are not computed.
         mode_info0 = mi.mv[0];
         need_left = 0;
         need_above = 0;
       }

       // Block type within one iteration, indexed downwards by txw_uv + txh_uv.
       const int type_idx_base = IntraBlockOffset - 1 - txw_uv - txh_uv;
       // Sub-blocks per unit in each direction.
       const int cnt_y_uv = 1 << (bh_log_uv - txh_uv - unit_y_log);
       const int cnt_x_uv = 1 << (bw_log_uv - txw_uv - unit_x_log);
       // NOTE(review): the unit loops run unit_*_log + 1 times, which equals 1 << unit_*_log
       // only for values 0 and 1 - confirm the range.
       for (int unit_y = 0; unit_y <= unit_y_log; ++unit_y) {
         for (int unit_x = 0; unit_x <= unit_x_log; ++unit_x) {
           for (int suby = 0; suby < cnt_y_uv; ++suby) {
             for (int subx = 0; subx < cnt_x_uv; ++subx) {
               // Sub-block coordinates within the whole block, then its position and size.
               const int x = subx + unit_x * cnt_x_uv;
               const int y = suby + unit_y * cnt_y_uv;
               const int col = mi_col_uv + (x << txw_uv);
               const int row = mi_row_uv + (y << txh_uv);
               const int subblk_w = 1 << txw_uv;
               const int subblk_h = 1 << txh_uv;
               // Neighbour presence is tested against the even-aligned block position.
               const int have_top = y || (mi_row1 > cb_row_srart);
               const int have_left = x || (mi_col1 > cb_col_srart);

               // One 4-byte index entry per sub-block: bit 0 = top-right flag,
               // bit 1 = bottom-left flag, remaining bits = slot offset.
               uint block_index = blocks_indexes.Load(index_addr);
               index_addr += 4;

               // When the mode needs the above edge, above_available becomes a length:
               // top part (clamped when xr < 0) plus the top-right part limited by xr.
               int above_available = have_top;
               if (need_above) {
                 // Remaining width to the right of this sub-block, built from the distance
                 // between the block and cb_mi_cols and then halved.
                 const int xr = ((cb_mi_cols - mi_col - bw) + (2 << bw_log_uv) - ((x + 1) << (txw_uv + 1))) >> 1;
                 int have_top_right = block_index & 1;
                 above_available =
                     (have_top ? min(subblk_w, subblk_w + xr) : 0) + (have_top_right ? min(subblk_w, xr) : 0);
               }

               // Same scheme for the left edge, using cb_mi_rows and the bottom-left flag.
               int left_available = have_left;
               if (need_left) {
                 const int yd = ((cb_mi_rows - mi_row - bh) + (2 << bh_log_uv) - ((y + 1) << (txh_uv + 1))) >> 1;
                 int have_bottom_left = block_index & 2;
                 left_available =
                     (have_left ? min(subblk_h, subblk_h + yd) : 0) + (have_bottom_left ? min(subblk_h, yd) : 0);
               }

               // The iteration number is read from the chroma part of the grid (offset by
               // cb_iter_grid_offset_uv) at cell (col + subblk_w, row + 1).
               int iter_grid_offset = cb_iter_grid_offset_uv;
               int iter_grid_stride = cb_iter_grid_stride_uv;
               int iter = intra_iter_grid.Load((iter_grid_offset + col + subblk_w + (row + 1) * iter_grid_stride) * 4);
               const int type_index = iter * IntraTypeCount + type_idx_base;
               // Slot index converted to a byte address (16 bytes per record).
               int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + (block_index >> 2);
               dst_ptr *= 16;

               // Flags common to both planes: above length at bit 10, left length at bit 16,
               // edge filter strengths at bits 24 / 26 only when that edge is available.
               uint uv_mode = mode_flags_base | (above_available << 10) | (left_available << 16) |
                              (above_available ? (dir_above_filter << 24) : 0) |
                              (left_available ? (dir_left_filter << 26) : 0);
               // U record.
               uint4 block;
               block.x = col | (row << 16);
               block.y = uv_mode | mode_u;
               block.z = mode_info0;
               block.w = 0;
               pred_blocks.Store4(dst_ptr, block);

               // V record goes to the next slot; only the flags word differs.
               dst_ptr += 16;
               block.y = uv_mode | mode_v;
               pred_blocks.Store4(dst_ptr, block);
             }
           }
         }
       }
     }
   }

   // Reconstruction records. do_recon is set when bits 8..15 of tx_info are all zero;
   // inter_recon additionally requires an OBMC pass (above or left).
   const int do_recon = (mi.tx_info & 0xff00) == 0;
   const int inter_recon = do_recon && (is_obmc_above || is_obmc_left);
   // Luma record: emitted for palette blocks and for inter_recon blocks.
   if (y_use_palette || inter_recon) {
     // Block type is selected by size: ReconBlockOffset + bw_log + 6 * bh_log.
     const int type_index = ReconBlockOffset + bw_log + 6 * bh_log;
     // Here the whole index entry is used as the slot offset (no flag bits are stripped).
     int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);

     index_addr += 4;
     // Slot index to byte address (16 bytes per record).
     dst_ptr *= 16;

     // y: bit 2 = do_recon, bit 3 = palette; the low two bits stay 0 for this record.
     uint4 block;
     block.x = mi_col | (mi_row << 16);
     block.y = (do_recon << 2) | (y_use_palette << 3);
     block.z = 0;
     block.w = 0;
     pred_blocks.Store4(dst_ptr, block);
   }
   // Chroma records: same layout at halved coordinates, written as two consecutive slots.
   // NOTE(review): the guard reads !is_chroma_ref - confirm the flag's polarity at its definition.
   if ((uv_use_palette || inter_recon) && !is_chroma_ref) {
     const int type_index = ReconBlockOffset + bw_log_uv + 6 * bh_log_uv;
     int dst_ptr = blocks_index_base.Load(4 * (cb_index_offset + type_index)) + blocks_indexes.Load(index_addr);

     index_addr += 4;
     dst_ptr *= 16;

     uint4 block;
     block.x = (mi_col >> 1) | ((mi_row >> 1) << 16);
     // Low two bits are 1 for the first record and 2 for the second
     // (presumably the U and V plane ids - confirm with the consumer).
     block.y = 1 | (do_recon << 2) | (uv_use_palette << 3);
     block.z = 0;
     block.w = 0;
     pred_blocks.Store4(dst_ptr, block);
     dst_ptr += 16;
     block.y = 2 | (do_recon << 2) | (uv_use_palette << 3);
     pred_blocks.Store4(dst_ptr, block);
   }
 }