// blob: cbe010bfb191f75245724ad8a9bbdcfa0a4260dd [file] [log] [blame]
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// Constants shared by the intra-prediction shader (bound at b0).
cbuffer IntraDataCommon : register(b0) {
// Per-plane layout, indexed by the plane field of a block record:
//   .x = row stride of the plane in dst_frame (bytes)
//   .y = byte offset of the plane inside dst_frame
//   .z = row stride of the plane in residuals (bytes)
//   .w = byte offset of the plane inside residuals
// NOTE(review): main() indexes this with a 2-bit field (0..3) while the array has 3 entries - confirm the
// host never emits plane == 3.
int4 cb_planes[3];
// Only .x is read by main(): a non-zero value enables the above-left corner smoothing.
int4 cb_flags;
// Not referenced by the code visible in this file.
int4 cb_filter[5][8][2];
// Per-mode parameters, indexed as [mode][mode_angle]:
//   .x, .y = per-row / per-column steps added to the 1/64-pel sample offsets (dx, dy)
//   .z     = flags: low 4 bits select the weighting scheme (Dir1/Dir2/Dir3/SmoothV/SmoothH),
//            bits NeedAboveShift..NeedAboveLeftShift say which edges are needed, FilterAboveLeftFlag
//            requests corner smoothing
//   .w     = right shift (with rounding) applied to the weighted sum
// NOTE(review): mode_angle is decoded from 3 bits (0..7) while the second dimension is 7 - confirm.
int4 cb_mode_params_lut[16][7];
// Smooth-prediction weights; main() reads only .x, at indices (bh + y) and (bw + x).
int4 cb_sm_weight_arrays[128];
};
// Per-dispatch parameters (bound at b1).
cbuffer PSSLIntraSRT : register(b1) {
// Number of blocks in each size class, largest first. A block of the class with counter
// cb_counts0.x is processed by 1 << 10 threads, .y by 1 << 9, .z by 1 << 8, .w by 1 << 7.
int4 cb_counts0;
// Continuation of the above: .x -> 1 << 6 threads per block, .y -> 1 << 5, .z -> 1 << 4, .w -> 1 << 3.
// Threads left over after all eight classes belong to blocks that use 1 << 2 threads.
int4 cb_counts1;
// Number of threads that have a block to process; threads at or past this index do no output work.
uint cb_wi_count;
// Index of the first block record (in pred_blocks) handled by this dispatch.
int cb_pass_offset;
};
// Block records, 16 bytes apart; main() reads the first three dwords of a record (see the bit layout
// comment inside main()).
ByteAddressBuffer pred_blocks : register(t0);
// Residuals, read as signed 16-bit values packed two per dword (four pixels = two dwords).
ByteAddressBuffer residuals : register(t1);
// Inter-intra blend weights, one byte per pixel, applied on a 0..64 scale by intra_inter_blend().
ByteAddressBuffer wedge_mask : register(t2);
// Frame buffer with 8-bit pixels: source of the neighbouring/reference pixels and destination of the
// prediction.
RWByteAddressBuffer dst_frame : register(u0);
// Weighting scheme stored in the low 4 bits of cb_mode_params_lut[..].z.
// Dir1/Dir2/Dir3 are compared as values; SmoothV/SmoothH are tested as bit flags.
#define Dir1 1
#define Dir2 2
#define Dir3 3
#define SmoothV 4
#define SmoothH 8
// Values of the 4-bit mode field of a block record that main() tests explicitly.
#define MODE_PAETH 11
#define MODE_INTRA_BC 12
#define MODE_FILTER 13
#define MODE_DC 14
#define MODE_CFL 15
// Bit positions inside cb_mode_params_lut[..].z telling which neighbouring edges a mode reads.
#define NeedAboveShift 4
#define NeedRightShift 5
#define NeedLeftShift 6
#define NeedBotShift 7
#define NeedAboveLeftShift 8
// Bit of cb_mode_params_lut[..].z that requests smoothing of the above-left corner sample.
#define FilterAboveLeftFlag 0x200
// Not referenced by the code visible in this file.
#define NeedAboveLeftLUT 0x08ff
// intra bc:
// SubpelBits: number of fractional bits in each 16-bit motion-vector component.
#define SubpelBits 4
// FilterLineShift / OutputShift: right shifts after the horizontal and vertical stages of compute_bc().
#define FilterLineShift 3
#define OutputShift 11
#define OffsetBits 19
// SumAddHor equals the (1 << 14) bias that compute_bc() adds in its horizontal stage; the macro itself
// is not referenced by the code visible in this file.
#define SumAddHor (1 << 14)
// SumAddVert: bias plus rounding term added before the OutputShift in compute_bc().
#define SumAddVert ((1 << OffsetBits) + (1 << (OutputShift - 1)))
// OutputSub: value subtracted after the OutputShift to remove the biases added by both stages.
#define OutputSub ((1 << (OffsetBits - OutputShift)) + (1 << (OffsetBits - OutputShift - 1)))
// Workgroup-shared edge buffers holding one unpacked 8-bit sample per int.
// main() gives every group of 4 consecutive threads a 20-int slot (64 slots for 256 threads); a block
// starts at its first slot + 2 so that indices -1 and -2 can hold the above-left sample.
// `above` is also reused as scratch space for the CFL luma-sum reduction.
groupshared int above[64 * 20];
groupshared int left[64 * 20];
// Two-tap horizontal + two-tap vertical interpolation of one intra block copy pixel.
//   p0, p1 : horizontally adjacent reference pixels on the first row
//   p2, p3 : the same columns on the next row
//   fh, fv : weight (out of 128) given to p0/p2 horizontally and to the first row vertically
// Returns the interpolated value clamped to 0..255. With fh == fv == 128 the result is p0.
int compute_bc(int p0, int p1, int p2, int p3, int fh, int fv) {
  const int fh_rest = 128 - fh;
  const int fv_rest = 128 - fv;
  const int round_hor = 1 << (FilterLineShift - 1);

  // Horizontal stage: biased by (1 << 14), the same value as SumAddHor.
  const int row_first = ((1 << 14) + p0 * fh + p1 * fh_rest + round_hor) >> FilterLineShift;
  const int row_second = ((1 << 14) + p2 * fh + p3 * fh_rest + round_hor) >> FilterLineShift;

  // Vertical stage, then shift down and subtract the accumulated bias.
  const int acc = SumAddVert + row_first * fv + row_second * fv_rest;
  const int unbiased = (acc >> OutputShift) - OutputSub;
  return clamp(unbiased, 0, 255);
}
// Adds a scaled AC contribution to a DC value for chroma-from-luma prediction.
//   dc    : DC prediction of the block
//   value : AC term with 6 fractional bits (may be negative)
// The AC term is divided by 64, rounding half away from zero, and the sum is clamped to 0..255.
uint compute_cfl_pixel(int dc, int value) {
  int scaled_ac;
  if (value < 0) {
    // Round the magnitude, then restore the sign.
    scaled_ac = -(((1 << 5) - value) >> 6);
  } else {
    scaled_ac = ((1 << 5) + value) >> 6;
  }
  return clamp(scaled_ac + dc, 0, 255);
}
// Blends an intra and an inter sample with a weight on a 0..64 scale.
//   mask  : weight of the intra sample (the inter sample gets 64 - mask)
//   intra : intra-predicted sample
//   inter : inter-predicted sample
// Returns the rounded weighted average clamped to 0..255.
int intra_inter_blend(int mask, int intra, int inter) {
  const int inter_weight = 64 - mask;
  const int weighted = mask * intra + inter_weight * inter;
  const int rounded = (weighted + 32) >> 6;
  return clamp(rounded, 0, 255);
}
// Number of threads in a workgroup (matches the numthreads attribute below).
#define WG_SIZE 256
// Intra prediction entry point.
// Every thread produces four horizontally adjacent 8-bit pixels of one block and stores them to
// dst_frame as a single dword. The steps are:
//   1. map the thread to a block record and decode it;
//   2. cooperatively load the above / left / above-left neighbours into groupshared memory;
//   3. optionally edge-filter and upsample those neighbours;
//   4. compute the prediction (intra block copy, DC, CFL, or directional/smooth/Paeth);
//   5. optionally blend with the pixels already in dst_frame (inter-intra) and add residuals.
// All threads, including those without a block, execute every GroupMemoryBarrierWithGroupSync();
// threads without a block return only after the last one.
[numthreads(256, 1, 1)] void main(uint3 thread
: SV_DispatchThreadID) {
int4 counts0 = cb_counts0;
int4 counts1 = cb_counts1;
// log2 of the number of threads assigned to one block; starts at the largest size class.
int bsize_log = 10;
int offset = cb_pass_offset;
int threadx = thread.x;
uint3 block = uint3(0, 0, 0);
int wi_count = 0;
// Default for threads without a block; active threads overwrite it below.
int wi = 1024;
if (thread.x < cb_wi_count) {
// Blocks are ordered by size class, largest first. Walk the eight class counters: while the thread
// index lies past the current class, skip its blocks (offset), rebase the thread index (threadx) and
// move on to the next smaller class.
if (threadx >= (counts0.x << 10)) {
offset += cb_counts0.x;
threadx -= counts0.x << 10;
bsize_log = 9;
if (threadx >= (counts0.y << 9)) {
offset += cb_counts0.y;
threadx -= counts0.y << 9;
bsize_log = 8;
if (threadx >= (counts0.z << 8)) {
offset += cb_counts0.z;
threadx -= counts0.z << 8;
bsize_log = 7;
if (threadx >= (counts0.w << 7)) {
offset += cb_counts0.w;
threadx -= counts0.w << 7;
bsize_log = 6;
if (threadx >= (counts1.x << 6)) {
offset += cb_counts1.x;
threadx -= counts1.x << 6;
bsize_log = 5;
if (threadx >= (counts1.y << 5)) {
offset += cb_counts1.y;
threadx -= counts1.y << 5;
bsize_log = 4;
if (threadx >= (counts1.z << 4)) {
offset += cb_counts1.z;
threadx -= counts1.z << 4;
bsize_log = 3;
if (threadx >= (counts1.w << 3)) {
offset += cb_counts1.w;
threadx -= counts1.w << 3;
bsize_log = 2;
}
}
}
}
}
}
}
}
// Block records are 16 bytes apart; only the first three dwords are read.
int block_index = offset + (threadx >> bsize_log);
block = pred_blocks.Load3(block_index << 4);
// wi_count: threads of this block that live in the same workgroup; wi: index of this thread among them.
wi_count = min(WG_SIZE, 1 << bsize_log);
wi = thread.x & (wi_count - 1);
}
// Block geometry: each thread covers 4 pixels, so a block has (4 << bw_log) x (1 << (bsize_log - bw_log))
// pixels.
const int bw_log = block.y & 7; // 0..4
const int bw = 4 << bw_log;
const int bh = 1 << (bsize_log - bw_log);
// Block position in pixels (stored in units of 4 pixels, x in the low and y in the high 16 bits).
int bx = (block.x & 0xffff) << 2;
int by = (block.x >> 16) << 2;
const int plane = (block.y >> 3) & 3;
// above_available is counted in 4-pixel words, left_available in pixels (rows).
const int above_available = (block.y >> 10) & 63;
const int left_available = ((block.y >> 16) & 63) << 2;
const int mode = (block.y >> 6) & 15;
const int mode_angle = (block.y >> 28) & 7;
int4 params = cb_mode_params_lut[mode][mode_angle];
// block.y bits (start bit, width, meaning):
// 0 3 bw_log
// 3 2 plane
// 5 1 non skip (residuals are added when set)
// 6 4 mode
// all modes except intra_bc:
// 10 6 above_available
// 16 6 left_available
// dir mode params:
// 22 2 upsample (bit 22: above, bit 23: left)
// 24 2 edge_filter above
// 26 2 edge_filter left
// 28 3 mode_angle
// 31 1 inter_intra?
// CFL:
// 22 4 alpha
// NOTE(review): the CFL path below extracts alpha with a 6-bit mask (& 63) - confirm the field width.
//
// block.z
// intra_bc - mv
// inter-intra - coef. table indexes;
// filter - bh_log | filter_mode;
// CFL (as read below) - luma width limit in the low 16 bits, luma height limit in the high 16 bits
// block.w - reserved; (prob. used for sorting);
const int stride = cb_planes[plane].x;
// Byte address of the dword that ends with the above-left neighbour: pixels (bx - 4 .. bx - 1, by - 1).
const int corner = cb_planes[plane].y + (bx - 4) + (by - 1) * stride;
// Start of this block's slot in above[] / left[]: 20 ints for every 4 threads, counted from the
// block's first thread inside the workgroup, plus 2 so that indices -1 and -2 stay inside the slot.
const int loc_base = (((thread.x & (WG_SIZE - 1)) - wi) >> 2) * 20 + 2;
// Number of above samples the mode reads: bw if it needs the above row, plus bh if it needs above-right.
int above_count = ((params.z >> NeedAboveShift) & 1) * bw + ((params.z >> NeedRightShift) & 1) * bh;
// Above:
// Each participating thread loads one dword (4 pixels) of the row above the block and unpacks it.
// Thread 0 always loads so that above[loc_base] is valid for the fallbacks further down.
if (wi < (above_count >> 2) || wi == 0) {
int addr = corner + 4 + 4 * min(wi, above_available - 1);
// Fallbacks when there is no above row: the dword left of the block's first row, or mid-grey - 1.
uint pixels =
above_available ? dst_frame.Load(addr) : left_available ? dst_frame.Load(corner + stride) : 0x7f7f7f7f;
// Past the available range: replicate the last byte of the (clamped) load into all four pixels.
if (wi >= above_available) pixels = (pixels >> 24) * 0x01010101;
above[loc_base + wi * 4 + 0] = (pixels >> 0) & 255;
above[loc_base + wi * 4 + 1] = (pixels >> 8) & 255;
above[loc_base + wi * 4 + 2] = (pixels >> 16) & 255;
above[loc_base + wi * 4 + 3] = (pixels >> 24) & 255;
}
// Left:
// The barrier makes above[loc_base] visible; it is used as a fallback for the left column.
GroupMemoryBarrierWithGroupSync();
// Number of left samples the mode reads: bh if it needs the left column, plus bw if it needs bottom-left.
int left_count = ((params.z >> NeedLeftShift) & 1) * bh + ((params.z >> NeedBotShift) & 1) * bw;
// Each thread loads one row: the top byte of the dword at `corner + (row + 1) * stride` is the pixel at
// x = bx - 1. Rows past left_available reuse the last available row.
if (wi < left_count || wi == 0) {
const int addr = corner + (min(wi, left_available - 1) + 1) * stride;
left[loc_base + wi] = left_available ? (dst_frame.Load(addr) >> 24) : above_available ? above[loc_base] : 129;
}
// Second pass for blocks that need more left samples than they have threads in this workgroup.
if (wi < (left_count - wi_count)) {
const int addr = corner + (min(wi + wi_count, left_available - 1) + 1) * stride;
left[loc_base + wi + wi_count] =
left_available ? (dst_frame.Load(addr) >> 24) : above_available ? above[loc_base] : 129;
}
GroupMemoryBarrierWithGroupSync();
// Above-left sample: written by the block's first thread to indices -1 and -2 of both arrays.
const int need_aboveleft = (params.z >> NeedAboveLeftShift) & 1;
if (wi == 0 && need_aboveleft) {
uint t = above[loc_base];
uint l = left[loc_base];
uint topleft = (left_available && above_available) ? (dst_frame.Load(corner) >> 24)
: left_available ? l : above_available ? t : 128;
// Corner smoothing with weights (5, 6, 5) / 16 over left[0], top-left and above[0].
if ((params.z & FilterAboveLeftFlag) && (bw + bh >= 24) && cb_flags.x)
topleft = ((l + t) * 5 + topleft * 6 + 8) >> 4;
above[loc_base - 1] = topleft;
left[loc_base - 1] = topleft;
above[loc_base - 2] = topleft;
left[loc_base - 2] = topleft;
}
GroupMemoryBarrierWithGroupSync();
// The upsample / edge-filter bits of block.y are honoured only for modes below MODE_DC.
int dir_mode = mode < MODE_DC;
int upsample_above = (block.y >> 22) & dir_mode;
int upsample_left = (block.y >> 23) & dir_mode;
// Edge filter, above row. filter_count is the number of samples to rewrite.
int filter_count = 0;
if (above_available && ((params.z >> NeedAboveShift) & 1)) {
filter_count = min(above_available << 2, bw) + ((params.z >> NeedRightShift) & 1) * bh + need_aboveleft - 1;
}
// Filter strength 0..3 from bits 24-25; forced to 0 for threads outside filter_count or non-dir modes.
int edge_filter = (block.y >> 24) & ((wi < filter_count && dir_mode) * 3);
// Accumulators start at 8, the rounding term of the final >> 4.
int sum0 = 8;
int sum1 = 8;
// Second sample handled by this thread when the edge is longer than the number of threads.
const int wi1 = wi + wi_count;
if (edge_filter) {
// Turn the strength into a nibble shift: the 5 tap weights of each kernel are packed into the hex
// constants below, one nibble per strength.
edge_filter <<= 2;
// When the above-left sample is part of the edge, start one entry earlier so it is tap index 0.
const int base = loc_base - need_aboveleft;
const int last = above_count;
//{ 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
sum0 += above[base + max(wi - 1, 0)] * ((0x2000 >> edge_filter) & 15);
sum0 += above[base + (wi + 0)] * ((0x4540 >> edge_filter) & 15);
sum0 += above[base + (wi + 1)] * ((0x4680 >> edge_filter) & 15);
sum0 += above[base + min(wi + 2, last)] * ((0x4540 >> edge_filter) & 15);
sum0 += above[base + min(wi + 3, last)] * ((0x2000 >> edge_filter) & 15);
// NOTE(review): this guard and `last` use above_count, while the write-back below and the whole left
// edge path use filter_count - confirm the asymmetry is intended.
if (wi1 < above_count) // rare, some modes for 4x4, 8x4, 4x8 blocks
{
sum1 += above[base + max(wi1 - 1, 0)] * ((0x2000 >> edge_filter) & 15);
sum1 += above[base + (wi1 + 0)] * ((0x4540 >> edge_filter) & 15);
sum1 += above[base + (wi1 + 1)] * ((0x4680 >> edge_filter) & 15);
sum1 += above[base + min(wi1 + 2, last)] * ((0x4540 >> edge_filter) & 15);
sum1 += above[base + min(wi1 + 3, last)] * ((0x2000 >> edge_filter) & 15);
}
}
// All threads must finish reading the unfiltered samples before any filtered value is written back.
GroupMemoryBarrierWithGroupSync();
if (edge_filter) {
above[loc_base + wi] = sum0 >> 4;
if (wi1 < filter_count) {
above[loc_base + wi1] = sum1 >> 4;
}
}
// Edge filter, left column (same scheme as above, strength from bits 26-27).
// NOTE(review): filter_count keeps the value computed for the above edge when the condition below is
// false - confirm the host always sends a zero left strength in that case.
if (left_available && ((params.z >> NeedLeftShift) & 1)) {
filter_count = min(left_available, bh) + ((params.z >> NeedBotShift) & 1) * bw + need_aboveleft - 1;
}
edge_filter = (block.y >> 26) & ((wi < filter_count && dir_mode) * 3);
if (edge_filter) {
sum0 = 8;
sum1 = 8;
edge_filter *= 4;
const int base = loc_base - need_aboveleft;
const int last = filter_count;
sum0 += left[base + max(wi - 1, 0)] * ((0x2000 >> edge_filter) & 15);
sum0 += left[base + (wi + 0)] * ((0x4540 >> edge_filter) & 15);
sum0 += left[base + (wi + 1)] * ((0x4680 >> edge_filter) & 15);
sum0 += left[base + min(wi + 2, last)] * ((0x4540 >> edge_filter) & 15);
sum0 += left[base + min(wi + 3, last)] * ((0x2000 >> edge_filter) & 15);
if (wi1 < filter_count) {
sum1 += left[base + max(wi1 - 1, 0)] * ((0x2000 >> edge_filter) & 15);
sum1 += left[base + (wi1 + 0)] * ((0x4540 >> edge_filter) & 15);
sum1 += left[base + (wi1 + 1)] * ((0x4680 >> edge_filter) & 15);
sum1 += left[base + min(wi1 + 2, last)] * ((0x4540 >> edge_filter) & 15);
sum1 += left[base + min(wi1 + 3, last)] * ((0x2000 >> edge_filter) & 15);
}
}
GroupMemoryBarrierWithGroupSync();
if (edge_filter) {
left[loc_base + wi] = sum0 >> 4;
if (wi1 < filter_count) {
left[loc_base + wi1] = sum1 >> 4;
}
}
GroupMemoryBarrierWithGroupSync();
// Edge upsampling (x2). Each participating thread reads five samples around input index 2 * wi, then,
// after a barrier, writes four outputs: the two original samples and the two half-way positions
// interpolated with the kernel (-1, 9, 9, -1) / 16. Input index k ends up at output index 2 * k.
int p0 = 0, p1 = 0, p2 = 0, p3 = 0, p4 = 0;
int do_upsample = upsample_above && wi < (above_count >> 1);
if (do_upsample) {
p0 = above[loc_base + wi * 2 - 2];
p1 = above[loc_base + wi * 2 - 1];
p2 = above[loc_base + wi * 2 + 0];
p3 = above[loc_base + wi * 2 + 1];
p4 = above[loc_base + min(wi * 2 + 2, above_count - 1)];
}
GroupMemoryBarrierWithGroupSync();
if (do_upsample) {
above[loc_base - 1 + wi * 4] = clamp((-p0 + 9 * p1 + 9 * p2 - p3 + 8) >> 4, 0, 255);
above[loc_base + 0 + wi * 4] = p2;
above[loc_base + 1 + wi * 4] = clamp((-p1 + 9 * p2 + 9 * p3 - p4 + 8) >> 4, 0, 255);
above[loc_base + 2 + wi * 4] = p3;
}
// Same for the left column.
do_upsample = upsample_left && wi < (left_count >> 1);
if (do_upsample) {
p0 = left[loc_base + wi * 2 - 2];
p1 = left[loc_base + wi * 2 - 1];
p2 = left[loc_base + wi * 2 + 0];
p3 = left[loc_base + wi * 2 + 1];
p4 = left[loc_base + min(wi * 2 + 2, left_count - 1)];
}
GroupMemoryBarrierWithGroupSync();
if (do_upsample) {
left[loc_base - 1 + wi * 4] = clamp((-p0 + 9 * p1 + 9 * p2 - p3 + 8) >> 4, 0, 255);
left[loc_base + 0 + wi * 4] = p2;
left[loc_base + 1 + wi * 4] = clamp((-p1 + 9 * p2 + 9 * p3 - p4 + 8) >> 4, 0, 255);
left[loc_base + 2 + wi * 4] = p3;
}
GroupMemoryBarrierWithGroupSync();
// Position of this thread's 4 pixels inside the block: x0 is the first column, y the row. They are
// derived from the global thread index so that blocks spanning several workgroups get distinct rows.
// int x0 = (wi & ((1 << bw_log) - 1)) << 2;
// int y = wi >> bw_log;
int x0 = (thread.x & ((1 << bw_log) - 1)) << 2;
int y = (thread.x & ((1 << bsize_log) - 1)) >> bw_log;
// Four predicted pixels, one per byte, lowest byte = leftmost pixel.
uint pixels = 0;
// Intra block copy: fetch the reference pixels from the already written part of dst_frame.
if (mode == MODE_INTRA_BC) {
// block.z packs the motion vector: x in the high 16 bits, y in the low 16 bits, both signed with
// SubpelBits fractional bits.
int mv = (int)block.z;
int mvx = bx + x0 + ((mv) >> (16 + SubpelBits));
int mvy = by + y + ((mv << 16) >> (16 + SubpelBits));
// The top fractional bit of each component selects full weight (128) or an even split (64).
const int filt_h = 128 >> ((mv >> 19) & 1);
const int filt_v = 128 >> ((mv >> 3) & 1);
int addr = cb_planes[plane].y + mvx + mvy * stride;
uint2 ref0, ref1 = 0;
// Loads must be dword aligned: load 8 bytes from the aligned address and shift the wanted bytes down.
// The left shift is split in two so that shift == 0 never shifts by 32.
const uint shift = (addr & 3) << 3;
addr &= ~3;
ref0 = dst_frame.Load2(addr);
ref0.x = (ref0.x >> shift) | ((ref0.y << (24 - shift)) << 8);
ref0.y = ref0.y >> shift;
// The second row is only needed when the vertical filter actually mixes two rows.
if (filt_v != 128) {
ref1 = dst_frame.Load2(addr + stride);
ref1.x = (ref1.x >> shift) | ((ref1.y << (24 - shift)) << 8);
ref1.y = ref1.y >> shift;
}
pixels =
(compute_bc((ref0.x >> 0) & 255, (ref0.x >> 8) & 255, (ref1.x >> 0) & 255, (ref1.x >> 8) & 255, filt_h, filt_v)
<< 0) |
(compute_bc((ref0.x >> 8) & 255, (ref0.x >> 16) & 255, (ref1.x >> 8) & 255, (ref1.x >> 16) & 255, filt_h,
filt_v)
<< 8) |
(compute_bc((ref0.x >> 16) & 255, (ref0.x >> 24) & 255, (ref1.x >> 16) & 255, (ref1.x >> 24) & 255, filt_h,
filt_v)
<< 16) |
(compute_bc((ref0.x >> 24) & 255, (ref0.y >> 0) & 255, (ref1.x >> 24) & 255, (ref1.y >> 0) & 255, filt_h,
filt_v)
<< 24);
}
// DC value (used by MODE_DC and as the base of MODE_CFL): rounded mean of the available above and
// left samples, or 128 when neither edge is available.
uint dc = 0;
if (mode >= MODE_DC) {
uint count = 0;
if (above_available) {
for (int i = 0; i < bw; ++i) dc += above[loc_base + i];
count += bw;
}
if (left_available) {
for (int i = 0; i < bh; ++i) dc += left[loc_base + i];
count += bh;
}
dc += count >> 1;
dc = count ? dc / count : 128;
pixels = 0x01010101 * dc;
}
// above[] is reused below as reduction scratch; wait until every thread has finished its DC sum.
GroupMemoryBarrierWithGroupSync();
// CFL, step 1: per-pixel luma values. For each of the thread's 4 pixels, sum the 2x2 luma samples
// of plane 0 at twice the pixel's coordinates and double the sum.
int4 luma = 0;
if (mode == MODE_CFL) {
// Limits taken from block.z; coordinates past them are clamped so that edge samples are repeated.
const int max_x = (block.z & 0xffff) - 4;
const int max_y = (block.z >> 16) - 2;
const int y_stride = cb_planes[0].x;
const int luma_x = (bx + x0) << 1;
const int luma_y = min((by + y) << 1, max_y);
int y_offset = cb_planes[0].y + min(luma_x, max_x) + luma_y * y_stride;
uint2 luma0 = dst_frame.Load2(y_offset);
uint2 luma1 = dst_frame.Load2(y_offset + y_stride);
if (luma_x >= max_x) {
// At the limit: the second dword becomes two copies of the upper 16 bits (last two samples) of the
// first dword; past the limit the first dword is replaced by that pattern as well.
luma0.y = (luma0.x >> 16) | (luma0.x & 0xffff0000);
luma1.y = (luma1.x >> 16) | (luma1.x & 0xffff0000);
if (luma_x > max_x) {
luma0.x = luma0.y;
luma1.x = luma1.y;
}
}
luma.x = ((luma0.x >> 0) & 255) + ((luma0.x >> 8) & 255) + ((luma1.x >> 0) & 255) + ((luma1.x >> 8) & 255);
luma.y = ((luma0.x >> 16) & 255) + ((luma0.x >> 24) & 255) + ((luma1.x >> 16) & 255) + ((luma1.x >> 24) & 255);
luma.z = ((luma0.y >> 0) & 255) + ((luma0.y >> 8) & 255) + ((luma1.y >> 0) & 255) + ((luma1.y >> 8) & 255);
luma.w = ((luma0.y >> 16) & 255) + ((luma0.y >> 24) & 255) + ((luma1.y >> 16) & 255) + ((luma1.y >> 24) & 255);
luma <<= 1;
// Per-thread partial sum for the block-wide reduction below.
above[loc_base + wi] = luma.x + luma.y + luma.z + luma.w;
}
GroupMemoryBarrierWithGroupSync();
// CFL, step 2: reduce the per-thread sums to a single block sum in above[loc_base]. The first step
// adds groups of four, the loop then halves the number of partial sums until one is left.
if (mode == MODE_CFL) // reduce
{
int count = wi_count >> 2;
// max cfl block 32x32 / 4(pixels) = 256 wi (wi_count);
// count = 256wi / 4 = up to 64wi == wavefront size, => we can use GroupMemoryBarrier();
int ofs = loc_base + wi * 4;
if (wi < count) {
int sum = above[ofs + 0] + above[ofs + 1] + above[ofs + 2] + above[ofs + 3];
GroupMemoryBarrier();
above[loc_base + wi] = sum;
GroupMemoryBarrier();
}
ofs = loc_base + wi * 2;
count >>= 1;
while (wi < count) {
int sum = above[ofs + 0] + above[ofs + 1];
GroupMemoryBarrier();
above[loc_base + wi] = sum;
GroupMemoryBarrier();
count >>= 1;
}
}
GroupMemoryBarrierWithGroupSync();
// Last barrier passed: threads without a block can leave now.
if (thread.x >= cb_wi_count) return;
// CFL, step 3: prediction = dc + alpha * (luma - block average), see compute_cfl_pixel().
if (mode == MODE_CFL) {
// Rounded mean of the per-pixel luma values (the block has 1 << (bsize_log + 2) pixels).
int avrg = (above[loc_base] + (2 << bsize_log)) >> (bsize_log + 2);
int alpha = ((block.y >> 22) & 63) - 16;
pixels = compute_cfl_pixel(dc, (luma.x - avrg) * alpha);
pixels |= compute_cfl_pixel(dc, (luma.y - avrg) * alpha) << 8;
pixels |= compute_cfl_pixel(dc, (luma.z - avrg) * alpha) << 16;
pixels |= compute_cfl_pixel(dc, (luma.w - avrg) * alpha) << 24;
}
// Directional, smooth and Paeth modes: every pixel is a weighted sum of up to seven neighbour
// samples; the mode only decides which weights are non-zero.
if (mode < MODE_INTRA_BC) {
const int frac_bits_x = 6 - upsample_above;
const int frac_bits_y = 6 - upsample_left;
const int min_base_x = -(1 << upsample_above);
// Upsampled edges hold twice as many samples.
above_count <<= upsample_above;
left_count <<= upsample_left;
uint topleft = above[loc_base - 1];
for (int i = 0; i < 4; ++i) {
int x = i + x0;
////1:
// int offset_x = (1 + y) * dx;
// int base_x = offset_x >> frac_bits_x;
//
////2:
// int offset_x = (x << 6) - (y + 1) * dx;
// const int base_x = offset_x >> frac_bits_x;
//
// int offset_y = (y << 6) - (x + 1) * dy;
// const int base_y = offset_y >> frac_bits_y;
//
////3:
// int offset_y = (1 + x) * dy;
// int base_y = offset_x >> frac_bits_x;
//
// int shift_y = ((offset_y << upsample_above) & 0x3F) >> 1;
// int shift_x = ((offset_x << upsample_above) & 0x3F) >> 1;
// Note: for non dir modes:
// dx = 0; dy = 0; upsample_above = 0; upsample_left = 0;
// Sample positions along the above row / left column in 1/64 units; params.x / params.y are the
// per-row / per-column steps (dx / dy in the notes above).
const int offset_x = (x << 6) + (y + 1) * params.x;
const int offset_y = (y << 6) + (x + 1) * params.y;
// Fractional parts reduced to 5 bits (0..31): interpolation weight of the second sample.
const int shift_x = ((offset_x << upsample_above) & 0x3F) >> 1;
const int shift_y = ((offset_y << upsample_left) & 0x3F) >> 1;
// Note: shift_x = 0 & shift_y = 0 if (dx == 0) & (dy == 0);
// maybe this can be used to simplify weight calc;
const int base_x = offset_x >> frac_bits_x;
// Two neighbouring samples on each edge; indices are clamped to [-2, count - 1], where the
// negative indices hold the above-left sample.
int l0 = left[loc_base + clamp(offset_y >> frac_bits_y, -2, left_count - 1)];
int l1 = left[loc_base + clamp((offset_y >> frac_bits_y) + 1, -2, left_count - 1)];
int t0 = above[loc_base + clamp(base_x, -2, above_count - 1)];
int t1 = above[loc_base + clamp(base_x + 1, -2, above_count - 1)];
// switch(mode)
//{
// case (SMOOTH_V | SMOOTH_H):
// w_t0 = *(sm_weight_arrays + bh + y);
// w_l0 = *(sm_weight_arrays + bw + x);
//
// case SMOOTH_V:
// w_t0 = *(sm_weight_arrays + bh + y);
//
// case SMOOTH_H:
// w_l0 = *(sm_weight_arrays + bw + x);
//
// case V:
// case DIR_1:
// w_t0 = 32 - shift_x;
//
// case H:
// case DIR_3:
// w_l0 = 32 - shift_y;
//
// case MODE_PAETH:
// int base = t0 + l0 - topleft;
// int diff_t = abs(t0 - base);
// int diff_l = abs(l0 - base);
// int diff_tl = abs(topleft - base);
// w_t0 = diff_t < diff_l && diff_t < diff_tl;
// w_l0 = diff_l < diff_t && diff_l < diff_tl;
//};
int mode_info = params.z & 15;
// Dir2 is resolved per pixel: predict from the above row while the position is still on it,
// otherwise from the left column.
if (mode_info == Dir2) // eliminate dir2
mode_info = base_x >= min_base_x ? Dir1 : Dir3;
int w_t0 = (mode_info & SmoothV) ? cb_sm_weight_arrays[bh + y].x : (mode_info == Dir1) ? (32 - shift_x) : 0;
int w_l0 = (mode_info & SmoothH) ? cb_sm_weight_arrays[bw + x].x : (mode_info == Dir3) ? (32 - shift_y) : 0;
// Paeth: give the full weight (32) to whichever of left / top / top-left is closest to
// (t0 + l0 - topleft); ties prefer left, then top.
if (mode == MODE_PAETH) {
int diff_t = topleft - l0;
int diff_l = topleft - t0;
int diff_tl = abs(diff_t + diff_l);
diff_t = abs(diff_t);
diff_l = abs(diff_l);
w_l0 = diff_l <= diff_t && diff_l <= diff_tl;
w_t0 = diff_t < diff_l && diff_t <= diff_tl;
w_l0 <<= 5;
w_t0 <<= 5;
}
int w_tl = (mode == MODE_PAETH) ? (32 - w_t0 - w_l0) : 0;
int w_l1 = (mode_info == Dir3) ? shift_y : 0;
int w_t1 = (mode_info == Dir1) ? shift_x : 0;
// Smooth modes: the complementary weight (out of 0x100) goes to the bottom-left / top-right sample.
int w_b = (mode_info & SmoothV) ? 0x100 - w_t0 : 0;
int w_r = (mode_info & SmoothH) ? 0x100 - w_l0 : 0;
int sum = w_tl * topleft + w_b * left[loc_base + bh - 1] + w_r * above[loc_base + bw - 1] + w_l0 * l0 +
w_t0 * t0 + w_l1 * l1 + w_t1 * t1;
// Normalise with rounding; params.w is the log2 of the total weight for the mode.
sum += 1 << (params.w - 1);
sum >>= params.w;
pixels |= clamp(sum, 0, 255) << (i * 8);
}
}
// Destination address of this thread's 4 pixels in dst_frame.
const int addr = corner + 4 + x0 + (y + 1) * stride;
// Inter-intra: blend the intra prediction with the pixels already stored at the destination, using
// the per-pixel weights selected by block.z.
if (block.y & 0x80000000) // inter-intra
{
int wedge_addr = (block.z << 6) + x0 + y * bw;
uint wedge = wedge_mask.Load(wedge_addr);
uint inter = dst_frame.Load(addr);
pixels = (intra_inter_blend((wedge >> 0) & 255, (pixels >> 0) & 255, (inter >> 0) & 255) << 0) |
(intra_inter_blend((wedge >> 8) & 255, (pixels >> 8) & 255, (inter >> 8) & 255) << 8) |
(intra_inter_blend((wedge >> 16) & 255, (pixels >> 16) & 255, (inter >> 16) & 255) << 16) |
(intra_inter_blend((wedge >> 24) & 255, (pixels >> 24) & 255, (inter >> 24) & 255) << 24);
}
// Non-skip blocks: add the residuals (signed 16-bit, two per dword) and clamp to 8 bits.
if (block.y & (1 << 5)) {
const int res_stride = cb_planes[plane].z;
const int res_addr = cb_planes[plane].w + 2 * (bx + x0) + (by + y) * res_stride;
int2 res = residuals.Load2(res_addr);
// Low halves are sign-extended with << 16 >> 16; high halves with an arithmetic >> 16.
pixels = (clamp((int)((pixels >> 0) & 255) + (int)((res.x << 16) >> 16), 0, 255) << 0) |
(clamp((int)((pixels >> 8) & 255) + (int)(res.x >> 16), 0, 255) << 8) |
(clamp((int)((pixels >> 16) & 255) + (int)((res.y << 16) >> 16), 0, 255) << 16) |
(clamp((int)((pixels >> 24) & 255) + (int)(res.y >> 16), 0, 255) << 24);
}
dst_frame.Store(addr, pixels);
}