| /* |
| * Copyright 2020 Google LLC |
| * |
| */ |
| |
| /* |
| * Copyright (c) 2020, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
// Constants shared by the intra prediction shader (constant buffer slot b0).
//   cb_planes[p]          - per-plane layout, as read by main():
//                             .x = row stride in bytes of plane p in dst_frame,
//                             .y = byte offset of plane p in dst_frame,
//                             .z = row stride in bytes of plane p in residuals,
//                             .w = byte offset of plane p in residuals.
//   cb_flags              - .x gates the top-left corner smoothing; other lanes are not read in this file view.
//   cb_filter             - not referenced in the visible code (presumably filter-intra taps - TODO confirm).
//   cb_mode_params_lut    - indexed [mode][mode_angle]; .x/.y are the per-row / per-column position
//                           increments (6 fractional bits), .z holds the mode-info nibble and the
//                           Need*/Filter* flag bits, .w is the final rounding shift.
//   cb_sm_weight_arrays   - smooth-prediction weights; only .x is read, at index (block size + position).
| cbuffer IntraDataCommon : register(b0) { |
| int4 cb_planes[3]; |
| int4 cb_flags; |
| int4 cb_filter[5][8][2]; |
| int4 cb_mode_params_lut[16][7]; |
| int4 cb_sm_weight_arrays[128]; |
| }; |
| |
// Per-dispatch parameters (constant buffer slot b1).
//   cb_counts0.xyzw - number of blocks whose thread count is 2^10, 2^9, 2^8, 2^7 respectively.
//   cb_counts1.xyzw - number of blocks whose thread count is 2^6, 2^5, 2^4, 2^3 respectively;
//                     threads past all eight classes are treated as belonging to 2^2-thread blocks.
//   cb_wi_count     - number of threads that have a block to process; threads at or above this
//                     index load no block record and exit before any frame store.
//   cb_pass_offset  - index of the first block record of this dispatch inside pred_blocks.
| cbuffer PSSLIntraSRT : register(b1) { |
| int4 cb_counts0; |
| int4 cb_counts1; |
| uint cb_wi_count; |
| int cb_pass_offset; |
| }; |
| |
// Resource bindings.
//   pred_blocks - block records, 16 bytes apart; main() reads the first three dwords of each.
//   residuals   - residual samples, two signed 16-bit values per dword.
//   wedge_mask  - inter-intra blend weights, four 8-bit weights per dword.
//   dst_frame   - frame being reconstructed (read for edges / references, written with the result);
//                 two pixels per dword, each masked to 10 bits from a 16-bit lane.
| ByteAddressBuffer pred_blocks : register(t0); |
| ByteAddressBuffer residuals : register(t1); |
| ByteAddressBuffer wedge_mask : register(t2); |
| RWByteAddressBuffer dst_frame : register(u0); |
| |
// Values of the mode-info nibble (params.z & 15) taken from cb_mode_params_lut.
// Dir1..Dir3 are compared for equality; SmoothV / SmoothH are tested as bit flags.
| #define Dir1 1 |
| #define Dir2 2 |
| #define Dir3 3 |
| #define SmoothV 4 |
| #define SmoothH 8 |
| |
// Values of the 4-bit prediction mode field stored in block.y bits 6..9.
// Modes below MODE_DC are treated as directional for upsampling and edge filtering.
| #define MODE_PAETH 11 |
| #define MODE_INTRA_BC 12 |
| #define MODE_FILTER 13 |
| #define MODE_DC 14 |
| #define MODE_CFL 15 |
| |
// Bit positions / masks inside params.z describing which neighbouring edges the mode reads.
// NeedAboveLeftLUT is not referenced in the visible code.
| #define NeedAboveShift 4 |
| #define NeedRightShift 5 |
| #define NeedLeftShift 6 |
| #define NeedBotShift 7 |
| #define NeedAboveLeftShift 8 |
| #define FilterAboveLeftFlag 0x200 |
| #define NeedAboveLeftLUT 0x08ff |
// Fixed-point constants of the two-tap intra block copy filter (see compute_bc):
// SubpelBits is the number of fractional motion vector bits, FilterLineShift / OutputShift are the
// horizontal / vertical rounding shifts, and the SumAdd* / OutputSub offsets cancel each other.
| // intra bc: |
| #define SubpelBits 4 |
| #define FilterLineShift 3 |
| #define OutputShift 11 |
| #define OffsetBits 19 |
| #define SumAddHor (1 << 14) |
| #define SumAddVert ((1 << OffsetBits) + (1 << (OutputShift - 1))) |
| #define OutputSub ((1 << (OffsetBits - OutputShift)) + (1 << (OffsetBits - OutputShift - 1))) |
| |
// Group-shared edge storage: one int per edge pixel. main() gives every run of four threads a
// 20-int slot (64 slots for the 256-thread group); a block's samples start two ints into its first
// slot so that indices -1 and -2 can hold the top-left pixel.
// 'above' is also reused as scratch space for the CFL luma sum reduction.
| groupshared int above[64 * 20]; |
| groupshared int left[64 * 20]; |
| |
// Bilinear (two-tap) interpolation of one intra block copy pixel.
//   p0, p1 - horizontally adjacent reference pixels of the upper row
//   p2, p3 - horizontally adjacent reference pixels of the lower row
//   fh, fv - weight (out of 128) given to the left column / upper row
// Returns the filtered pixel clamped to the 10-bit range.
int compute_bc(int p0, int p1, int p2, int p3, int fh, int fv) {
  const int round_hor = 1 << (FilterLineShift - 1);

  // Horizontal pass on both rows; SumAddHor keeps the intermediate values positive.
  const int top = (SumAddHor + p0 * fh + p1 * (128 - fh) + round_hor) >> FilterLineShift;
  const int bottom = (SumAddHor + p2 * fh + p3 * (128 - fh) + round_hor) >> FilterLineShift;

  // Vertical pass, then remove the accumulated offsets.
  const int blended = SumAddVert + top * fv + bottom * (128 - fv);
  return clamp((blended >> OutputShift) - OutputSub, 0, 1023);
}
| |
// Adds the scaled chroma-from-luma term to the DC prediction.
//   dc    - DC prediction of the block
//   value - luma AC term already multiplied by alpha (6 fractional bits)
// The term is rounded to nearest with ties away from zero; the result is clamped to 10 bits.
uint compute_cfl_pixel(int dc, int value) {
  const int rounding = 1 << 5;
  int scaled;
  if (value < 0) {
    scaled = -((rounding - value) >> 6);
  } else {
    scaled = (value + rounding) >> 6;
  }
  return clamp(dc + scaled, 0, 1023);
}
| |
// Blends an intra and an inter pixel with a 6-bit weight.
//   mask  - weight of the intra pixel (64 selects intra only)
//   intra - intra-predicted pixel
//   inter - inter-predicted pixel
// Returns the rounded weighted average clamped to the 10-bit range.
int intra_inter_blend(int mask, int intra, int inter) {
  const int inter_weight = 64 - mask;
  const int weighted = intra * mask + inter * inter_weight + 32;
  return clamp(weighted >> 6, 0, 1023);
}
#define WG_SIZE 256
// Intra prediction / reconstruction entry point.
//
// Every thread produces four horizontally adjacent pixels of one prediction block and stores them
// to dst_frame as two dwords (two pixels per dword). Blocks are grouped by thread count: the
// dispatch first covers all blocks that need 2^10 threads, then 2^9, ... down to 2^2, with the
// per-class block counts supplied in cb_counts0 / cb_counts1.
//
// Stages:
//   1. map the thread to its block record and decode the record;
//   2. load the above / left / top-left neighbours into group-shared memory;
//   3. optionally edge-filter and upsample the neighbours (directional modes);
//   4. predict (intra block copy, DC, chroma-from-luma, or the generic weighted predictor);
//   5. optionally blend with the inter prediction and add the residual, then store.
//
// Threads with thread.x >= cb_wi_count still execute every group barrier and only return right
// before the per-pixel work, so that all barriers are reached by the whole group.
[numthreads(WG_SIZE, 1, 1)] void main(uint3 thread : SV_DispatchThreadID) {
  int4 counts0 = cb_counts0;
  int4 counts1 = cb_counts1;
  int bsize_log = 10;
  int offset = cb_pass_offset;
  int threadx = thread.x;
  uint3 block = uint3(0, 0, 0);
  int wi_count = 0;
  int wi = 1024;
  if (thread.x < cb_wi_count) {
    // Walk the size classes from the largest (2^10 threads per block) to the smallest (2^2):
    // skip the threads and the block records of every class this thread lies beyond.
    if (threadx >= (counts0.x << 10)) {
      offset += cb_counts0.x;
      threadx -= counts0.x << 10;
      bsize_log = 9;
      if (threadx >= (counts0.y << 9)) {
        offset += cb_counts0.y;
        threadx -= counts0.y << 9;
        bsize_log = 8;
        if (threadx >= (counts0.z << 8)) {
          offset += cb_counts0.z;
          threadx -= counts0.z << 8;
          bsize_log = 7;
          if (threadx >= (counts0.w << 7)) {
            offset += cb_counts0.w;
            threadx -= counts0.w << 7;
            bsize_log = 6;
            if (threadx >= (counts1.x << 6)) {
              offset += cb_counts1.x;
              threadx -= counts1.x << 6;
              bsize_log = 5;
              if (threadx >= (counts1.y << 5)) {
                offset += cb_counts1.y;
                threadx -= counts1.y << 5;
                bsize_log = 4;
                if (threadx >= (counts1.z << 4)) {
                  offset += cb_counts1.z;
                  threadx -= counts1.z << 4;
                  bsize_log = 3;
                  if (threadx >= (counts1.w << 3)) {
                    offset += cb_counts1.w;
                    threadx -= counts1.w << 3;
                    bsize_log = 2;
                  }
                }
              }
            }
          }
        }
      }
    }

    int block_index = offset + (threadx >> bsize_log);
    block = pred_blocks.Load3(block_index << 4);
    // A block never uses more than one thread group worth of edge-loading threads.
    wi_count = min(WG_SIZE, 1 << bsize_log);
    wi = thread.x & (wi_count - 1);
  }
  const int bw_log = block.y & 7;  // 0..4
  const int bw = 4 << bw_log;
  const int bh = 1 << (bsize_log - bw_log);

  // Block position in pixels (stored in units of 4 pixels).
  int bx = (block.x & 0xffff) << 2;
  int by = (block.x >> 16) << 2;

  const int plane = (block.y >> 3) & 3;
  // above_available counts dwords (pixel pairs) of the above row, left_available counts rows.
  const int above_available = ((block.y >> 10) & 63) << 1;
  const int left_available = ((block.y >> 16) & 63) << 2;

  const int mode = (block.y >> 6) & 15;

  const int mode_angle = (block.y >> 28) & 7;
  int4 params = cb_mode_params_lut[mode][mode_angle];

  // block.y bits:
  // 0   3 bw_log
  // 3   2 plane
  // 5   1 non skip
  // 6   4 mode
  // all modes except intra_bc:
  // 10  6 above_available
  // 16  6 left_available
  // dir mode params:
  // 22  2 upsample
  // 24  2 edge_filter above
  // 26  2 edge_filter left
  // 28  3 mode_angle
  // 31  1 inter_intra?
  // CFL:
  // 22  4 alpha
  //     NOTE(review): the CFL path below masks 6 bits ((block.y >> 22) & 63) and subtracts 16,
  //     which does not match the 4-bit width listed here - confirm against the host-side packing.
  //
  // block.z
  //  intra_bc - mv
  //  inter-intra - coef. table indexes;
  //  filter - bh_log | filter_mode;
  // block.w - reserved; (prob. used for sorting);

  const int stride = cb_planes[plane].x;
  // Byte address of the dword that holds the top-left neighbour in its upper 16 bits.
  const int corner = cb_planes[plane].y + ((bx << 1) - 4) + (by - 1) * stride;

  // Start of this block's edge samples in group-shared memory: 20 ints per 4 threads,
  // with two ints reserved in front for the top-left pixel.
  const int loc_base = (((thread.x & (WG_SIZE - 1)) - wi) >> 2) * 20 + 2;
  int above_count = ((params.z >> NeedAboveShift) & 1) * bw + ((params.z >> NeedRightShift) & 1) * bh;

  // Above: each thread loads one dword (two pixels); unavailable pixels replicate the last
  // available one, or fall back to the first left pixel / a constant.
  if (wi < (above_count >> 1) || wi == 0) {
    int addr = corner + 4 + 4 * min(wi, above_available - 1);
    uint pixels =
        above_available ? dst_frame.Load(addr) : left_available ? dst_frame.Load(corner + stride) : 0x01ff0000;
    if (wi >= above_available) pixels = (pixels & 0xffff0000) | (pixels >> 16);
    above[loc_base + (wi << 1) + 0] = (pixels >> 0) & 1023;
    above[loc_base + (wi << 1) + 1] = (pixels >> 16) & 1023;
  }

  // Left:
  GroupMemoryBarrierWithGroupSync();

  int left_count = ((params.z >> NeedLeftShift) & 1) * bh + ((params.z >> NeedBotShift) & 1) * bw;

  // Each thread loads one left pixel (upper half of the dword left of the block); a second
  // pixel is loaded when the edge is longer than the number of threads of the block.
  if (wi < left_count || wi == 0) {
    const int addr = corner + (min(wi, left_available - 1) + 1) * stride;
    left[loc_base + wi] = left_available ? (dst_frame.Load(addr) >> 16) : above_available ? above[loc_base] : 513;
  }
  if (wi < (left_count - wi_count)) {
    const int addr = corner + (min(wi + wi_count, left_available - 1) + 1) * stride;
    left[loc_base + wi + wi_count] =
        left_available ? (dst_frame.Load(addr) >> 16) : above_available ? above[loc_base] : 513;
  }

  GroupMemoryBarrierWithGroupSync();

  // Top-left pixel, optionally smoothed with its two neighbours; stored at indices -1 and -2
  // of both edges.
  const int need_aboveleft = (params.z >> NeedAboveLeftShift) & 1;
  if (wi == 0 && need_aboveleft) {
    uint t = above[loc_base];
    uint l = left[loc_base];
    uint topleft = (left_available && above_available) ? (dst_frame.Load(corner) >> 16)
                   : left_available                    ? l
                   : above_available                   ? t
                                                       : 512;

    if ((params.z & FilterAboveLeftFlag) && (bw + bh >= 24) && cb_flags.x)
      topleft = ((l + t) * 5 + topleft * 6 + 8) >> 4;

    above[loc_base - 1] = topleft;
    left[loc_base - 1] = topleft;
    above[loc_base - 2] = topleft;
    left[loc_base - 2] = topleft;
  }

  GroupMemoryBarrierWithGroupSync();

  // Upsampling and edge filtering only apply to modes below MODE_DC.
  int dir_mode = mode < MODE_DC;
  int upsample_above = (block.y >> 22) & dir_mode;
  int upsample_left = (block.y >> 23) & dir_mode;

  int filter_count = 0;
  if (above_available && ((params.z >> NeedAboveShift) & 1)) {
    filter_count = min(above_available << 2, bw) + ((params.z >> NeedRightShift) & 1) * bh + need_aboveleft - 1;
  }
  int edge_filter = (block.y >> 24) & ((wi < filter_count && dir_mode) * 3);

  // Above edge filter. The 5-tap kernel is selected by the strength (1..3); its taps are packed
  // as nibbles in the hex constants below. All reads happen before the barrier, all writes after.
  int sum0 = 8;
  int sum1 = 8;
  const int wi1 = wi + wi_count;
  if (edge_filter) {
    edge_filter <<= 2;
    const int base = loc_base - need_aboveleft;
    const int last = filter_count;
    //{ 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
    sum0 += above[base + max(wi - 1, 0)] * ((0x2000 >> edge_filter) & 15);
    sum0 += above[base + (wi + 0)] * ((0x4540 >> edge_filter) & 15);
    sum0 += above[base + (wi + 1)] * ((0x4680 >> edge_filter) & 15);
    sum0 += above[base + min(wi + 2, last)] * ((0x4540 >> edge_filter) & 15);
    sum0 += above[base + min(wi + 3, last)] * ((0x2000 >> edge_filter) & 15);
    if (wi1 < filter_count)  // rare, some modes for 4x4, 8x4, 4x8 blocks
    {
      sum1 += above[base + max(wi1 - 1, 0)] * ((0x2000 >> edge_filter) & 15);
      sum1 += above[base + (wi1 + 0)] * ((0x4540 >> edge_filter) & 15);
      sum1 += above[base + (wi1 + 1)] * ((0x4680 >> edge_filter) & 15);
      sum1 += above[base + min(wi1 + 2, last)] * ((0x4540 >> edge_filter) & 15);
      sum1 += above[base + min(wi1 + 3, last)] * ((0x2000 >> edge_filter) & 15);
    }
  }
  GroupMemoryBarrierWithGroupSync();
  if (edge_filter) {
    above[loc_base + wi] = sum0 >> 4;
    if (wi1 < filter_count) {
      above[loc_base + wi1] = sum1 >> 4;
    }
  }

  // Left edge filter, same scheme.
  // NOTE(review): filter_count is not reset here, so when the left edge is unavailable or not
  // needed the value computed for the above edge is reused - presumably the host then passes a
  // zero left filter strength; confirm.
  if (left_available && ((params.z >> NeedLeftShift) & 1)) {
    filter_count = min(left_available, bh) + ((params.z >> NeedBotShift) & 1) * bw + need_aboveleft - 1;
  }
  edge_filter = (block.y >> 26) & ((wi < filter_count && dir_mode) * 3);
  if (edge_filter) {
    sum0 = 8;
    sum1 = 8;
    edge_filter *= 4;
    const int base = loc_base - need_aboveleft;
    const int last = filter_count;
    sum0 += left[base + max(wi - 1, 0)] * ((0x2000 >> edge_filter) & 15);
    sum0 += left[base + (wi + 0)] * ((0x4540 >> edge_filter) & 15);
    sum0 += left[base + (wi + 1)] * ((0x4680 >> edge_filter) & 15);
    sum0 += left[base + min(wi + 2, last)] * ((0x4540 >> edge_filter) & 15);
    sum0 += left[base + min(wi + 3, last)] * ((0x2000 >> edge_filter) & 15);
    if (wi1 < filter_count) {
      sum1 += left[base + max(wi1 - 1, 0)] * ((0x2000 >> edge_filter) & 15);
      sum1 += left[base + (wi1 + 0)] * ((0x4540 >> edge_filter) & 15);
      sum1 += left[base + (wi1 + 1)] * ((0x4680 >> edge_filter) & 15);
      sum1 += left[base + min(wi1 + 2, last)] * ((0x4540 >> edge_filter) & 15);
      sum1 += left[base + min(wi1 + 3, last)] * ((0x2000 >> edge_filter) & 15);
    }
  }
  GroupMemoryBarrierWithGroupSync();
  if (edge_filter) {
    left[loc_base + wi] = sum0 >> 4;
    if (wi1 < filter_count) {
      left[loc_base + wi1] = sum1 >> 4;
    }
  }

  GroupMemoryBarrierWithGroupSync();

  // 2x upsampling of the above edge: every thread reads five neighbouring samples, then (after
  // the barrier) writes two original and two interpolated (-1, 9, 9, -1) / 16 samples.
  int p0 = 0, p1 = 0, p2 = 0, p3 = 0, p4 = 0;
  int do_upsample = upsample_above && wi < (above_count >> 1);
  if (do_upsample) {
    p0 = above[loc_base + wi * 2 - 2];
    p1 = above[loc_base + wi * 2 - 1];
    p2 = above[loc_base + wi * 2 + 0];
    p3 = above[loc_base + wi * 2 + 1];
    p4 = above[loc_base + min(wi * 2 + 2, above_count - 1)];
  }
  GroupMemoryBarrierWithGroupSync();
  if (do_upsample) {
    above[loc_base - 1 + wi * 4] = clamp((-p0 + 9 * p1 + 9 * p2 - p3 + 8) >> 4, 0, 1023);
    above[loc_base + 0 + wi * 4] = p2;
    above[loc_base + 1 + wi * 4] = clamp((-p1 + 9 * p2 + 9 * p3 - p4 + 8) >> 4, 0, 1023);
    above[loc_base + 2 + wi * 4] = p3;
  }

  // Same for the left edge.
  do_upsample = upsample_left && wi < (left_count >> 1);
  if (do_upsample) {
    p0 = left[loc_base + wi * 2 - 2];
    p1 = left[loc_base + wi * 2 - 1];
    p2 = left[loc_base + wi * 2 + 0];
    p3 = left[loc_base + wi * 2 + 1];
    p4 = left[loc_base + min(wi * 2 + 2, left_count - 1)];
  }
  GroupMemoryBarrierWithGroupSync();
  if (do_upsample) {
    left[loc_base - 1 + wi * 4] = clamp((-p0 + 9 * p1 + 9 * p2 - p3 + 8) >> 4, 0, 1023);
    left[loc_base + 0 + wi * 4] = p2;
    left[loc_base + 1 + wi * 4] = clamp((-p1 + 9 * p2 + 9 * p3 - p4 + 8) >> 4, 0, 1023);
    left[loc_base + 2 + wi * 4] = p3;
  }
  GroupMemoryBarrierWithGroupSync();

  // Position of this thread's four pixels inside the block. thread.x is used instead of wi
  // because wi wraps at WG_SIZE for blocks that span several thread groups.
  // int x0 = (wi & ((1 << bw_log) - 1)) << 2;
  // int y = wi >> bw_log;
  int x0 = (thread.x & ((1 << bw_log) - 1)) << 2;
  int y = (thread.x & ((1 << bsize_log) - 1)) >> bw_log;

  uint2 pixels = uint2(0, 0);
  if (mode == MODE_INTRA_BC) {
    // Intra block copy: block.z packs the motion vector as (x << 16) | y with SubpelBits
    // fractional bits; bit 3 of each component selects the half-pel (64/64) filter.
    int mv = (int)block.z;
    int mvx = bx + x0 + ((mv) >> (16 + SubpelBits));
    int mvy = by + y + ((mv << 16) >> (16 + SubpelBits));
    const int filt_h = 128 >> ((mv >> 19) & 1);
    const int filt_v = 128 >> ((mv >> 3) & 1);
    int addr = cb_planes[plane].y + (mvx << 1) + mvy * stride;
    uint3 ref0, ref1 = 0;

    // The reference may start at an odd pixel: load three aligned dwords and shift them so that
    // the first reference pixel ends up in the low half of .x (five pixels are needed in total).
    const uint shift = (addr & 2) << 3;
    addr &= ~3;
    ref0 = dst_frame.Load3(addr);
    ref0.x = (ref0.x >> shift) | ((ref0.y << (24 - shift)) << 8);
    ref0.y = (ref0.y >> shift) | ((ref0.z << (24 - shift)) << 8);
    ref0.z = ref0.z >> shift;

    // The second row is only needed for vertical half-pel positions.
    if (filt_v != 128) {
      ref1 = dst_frame.Load3(addr + stride);
      ref1.x = (ref1.x >> shift) | ((ref1.y << (24 - shift)) << 8);
      ref1.y = (ref1.y >> shift) | ((ref1.z << (24 - shift)) << 8);
      ref1.z = ref1.z >> shift;
    }

    // Output pixel k uses reference pixels k and k + 1 of both rows. Pixels are packed two per
    // dword, so the second pixel of a dword is always at bit 16 (the lower-row sample of the
    // second output pixel previously used a shift of 8, which mixed two neighbouring pixels).
    pixels.x = (compute_bc((ref0.x >> 0) & 1023, (ref0.x >> 16) & 1023, (ref1.x >> 0) & 1023, (ref1.x >> 16) & 1023,
                           filt_h, filt_v)
                << 0) |
               (compute_bc((ref0.x >> 16) & 1023, (ref0.y >> 0) & 1023, (ref1.x >> 16) & 1023, (ref1.y >> 0) & 1023,
                           filt_h, filt_v)
                << 16);
    pixels.y = (compute_bc((ref0.y >> 0) & 1023, (ref0.y >> 16) & 1023, (ref1.y >> 0) & 1023, (ref1.y >> 16) & 1023,
                           filt_h, filt_v)
                << 0) |
               (compute_bc((ref0.y >> 16) & 1023, (ref0.z >> 0) & 1023, (ref1.y >> 16) & 1023, (ref1.z >> 0) & 1023,
                           filt_h, filt_v)
                << 16);
  }

  // DC prediction (also the base value of CFL): rounded mean of the available edges,
  // 512 when no edge is available.
  uint dc = 0;
  if (mode >= MODE_DC) {
    uint count = 0;
    // todo: maybe some optimization here?
    if (above_available) {
      for (int i = 0; i < bw; ++i) dc += above[loc_base + i];
      count += bw;
    }
    if (left_available) {
      for (int i = 0; i < bh; ++i) dc += left[loc_base + i];
      count += bh;
    }
    dc += count >> 1;
    dc = count ? dc / count : 512;

    pixels.x = dc | (dc << 16);
    pixels.y = pixels.x;
  }

  // The DC loops above read 'above'; the CFL path below overwrites it.
  GroupMemoryBarrierWithGroupSync();

  // Chroma-from-luma: every chroma pixel uses the sum of a 2x2 luma square (times two).
  // block.z carries the luma limits used to clamp the fetch position.
  int4 luma = 0;
  if (mode == MODE_CFL) {
    const int max_y = (block.z >> 16) - 2;
    const int y_stride = cb_planes[0].x;
    const int luma_y = min((by + y) << 1, max_y);

    // const int max_x = (block.z & 0xffff) - 4;
    // const int luma_x = (bx + x0) << 1;
    // int y_offset = cb_planes[0].y + min(luma_x, max_x) + luma_y * y_stride;
    const int max_x = ((block.z & 0xffff) << 1) - 8;
    const int luma_x = (bx + x0) << 2;
    int y_offset = cb_planes[0].y + min(luma_x, max_x) + luma_y * y_stride;

    uint4 luma0 = dst_frame.Load4(y_offset);
    uint4 luma1 = dst_frame.Load4(y_offset + y_stride);

    // At or beyond the right limit: replicate the last valid luma pair.
    if (luma_x >= max_x) {
      luma0.zw = luma0.yy;
      luma1.zw = luma1.yy;
      if (luma_x > max_x) {
        luma0.x = luma0.y;
        luma1.x = luma1.y;
      }
    }

    luma.x = ((luma0.x >> 0) & 1023) + ((luma0.x >> 16) & 1023) + ((luma1.x >> 0) & 1023) + ((luma1.x >> 16) & 1023);
    luma.y = ((luma0.y >> 0) & 1023) + ((luma0.y >> 16) & 1023) + ((luma1.y >> 0) & 1023) + ((luma1.y >> 16) & 1023);
    luma.z = ((luma0.z >> 0) & 1023) + ((luma0.z >> 16) & 1023) + ((luma1.z >> 0) & 1023) + ((luma1.z >> 16) & 1023);
    luma.w = ((luma0.w >> 0) & 1023) + ((luma0.w >> 16) & 1023) + ((luma1.w >> 0) & 1023) + ((luma1.w >> 16) & 1023);
    luma <<= 1;
    // Per-thread partial sum for the block average, reduced below.
    above[loc_base + wi] = luma.x + luma.y + luma.z + luma.w;
  }

  GroupMemoryBarrierWithGroupSync();

  if (mode == MODE_CFL)  // reduce
  {
    int count = wi_count >> 2;
    // max cfl block 32x32 / 4(pixels) = 256 wi (wi_count);
    // count = 256wi / 4 = up to 64wi == wavefront size, => we can use GroupMemoryBarrier();
    int ofs = loc_base + wi * 4;
    if (wi < count) {
      int sum = above[ofs + 0] + above[ofs + 1] + above[ofs + 2] + above[ofs + 3];
      GroupMemoryBarrier();
      above[loc_base + wi] = sum;
      GroupMemoryBarrier();
    }
    ofs = loc_base + wi * 2;
    count >>= 1;
    while (wi < count) {
      int sum = above[ofs + 0] + above[ofs + 1];
      GroupMemoryBarrier();
      above[loc_base + wi] = sum;
      GroupMemoryBarrier();
      count >>= 1;
    }
  }

  // Last group barrier: threads without a block may leave from here on.
  GroupMemoryBarrierWithGroupSync();
  if (thread.x >= cb_wi_count) return;

  if (mode == MODE_CFL) {
    // above[loc_base] now holds the sum over the whole block; convert it to a rounded average.
    int avrg = (above[loc_base] + (2 << bsize_log)) >> (bsize_log + 2);
    int alpha = ((block.y >> 22) & 63) - 16;
    pixels.x = compute_cfl_pixel(dc, (luma.x - avrg) * alpha);
    pixels.x |= compute_cfl_pixel(dc, (luma.y - avrg) * alpha) << 16;
    pixels.y = compute_cfl_pixel(dc, (luma.z - avrg) * alpha);
    pixels.y |= compute_cfl_pixel(dc, (luma.w - avrg) * alpha) << 16;
  }

  // Generic predictor for directional, smooth and Paeth modes: every pixel is a weighted sum of
  // up to two above samples, two left samples, the top-left, the bottom-left and the top-right
  // sample; the mode only decides the weights.
  if (mode < MODE_INTRA_BC) {
    const int frac_bits_x = 6 - upsample_above;
    const int frac_bits_y = 6 - upsample_left;
    const int min_base_x = -(1 << upsample_above);

    above_count <<= upsample_above;
    left_count <<= upsample_left;
    uint topleft = above[loc_base - 1];

    for (int i = 0; i < 4; ++i) {
      int x = i + x0;
      ////1:
      // int offset_x = (1 + y) * dx;
      // int base_x = offset_x >> frac_bits_x;
      //
      ////2:
      // int offset_x = (x << 6) - (y + 1) * dx;
      // const int base_x = offset_x >> frac_bits_x;
      //
      // int offset_y = (y << 6) - (x + 1) * dy;
      // const int base_y = offset_y >> frac_bits_y;
      //
      ////3:
      // int offset_y = (1 + x) * dy;
      // int base_y = offset_x >> frac_bits_x;
      //
      // int shift_y = ((offset_y << upsample_above) & 0x3F) >> 1;
      // int shift_x = ((offset_x << upsample_above) & 0x3F) >> 1;
      // Note: for non dir modes:
      // dx = 0; dy = 0; upsample_above = 0; upsample_left = 0;

      const int offset_x = (x << 6) + (y + 1) * params.x;
      const int offset_y = (y << 6) + (x + 1) * params.y;
      const int shift_x = ((offset_x << upsample_above) & 0x3F) >> 1;
      const int shift_y = ((offset_y << upsample_left) & 0x3F) >> 1;

      // Note: shift_x = 0 & shift_y = 0 if (dx == 0) & (dy == 0);
      // maybe this can be used to simplify weight calc;

      const int base_x = offset_x >> frac_bits_x;
      int l0 = left[loc_base + clamp(offset_y >> frac_bits_y, -2, left_count - 1)];
      int l1 = left[loc_base + clamp((offset_y >> frac_bits_y) + 1, -2, left_count - 1)];
      int t0 = above[loc_base + clamp(base_x, -2, above_count - 1)];
      int t1 = above[loc_base + clamp(base_x + 1, -2, above_count - 1)];

      // switch(mode)
      //{
      //    case (SMOOTH_V | SMOOTH_H):
      //        w_t0 = *(sm_weight_arrays + bh + y);
      //        w_l0 = *(sm_weight_arrays + bw + x);
      //
      //    case SMOOTH_V:
      //        w_t0 = *(sm_weight_arrays + bh + y);
      //
      //    case SMOOTH_H:
      //        w_l0 = *(sm_weight_arrays + bw + x);
      //
      //    case V:
      //    case DIR_1:
      //        w_t0 = 32 - shift_x;
      //
      //    case H:
      //    case DIR_3:
      //        w_l0 = 32 - shift_y;
      //
      //    case MODE_PAETH:
      //        int base = t0 + l0 - topleft;
      //        int diff_t = abs(t0 - base);
      //        int diff_l = abs(l0 - base);
      //        int diff_tl = abs(topleft - base);
      //        w_t0 = diff_t < diff_l && diff_t < diff_tl;
      //        w_l0 = diff_l < diff_t && diff_l < diff_tl;
      //};

      int mode_info = params.z & 15;
      if (mode_info == Dir2)  // eliminate dir2
        mode_info = base_x >= min_base_x ? Dir1 : Dir3;

      int w_t0 = (mode_info & SmoothV) ? cb_sm_weight_arrays[bh + y].x : (mode_info == Dir1) ? (32 - shift_x) : 0;
      int w_l0 = (mode_info & SmoothH) ? cb_sm_weight_arrays[bw + x].x : (mode_info == Dir3) ? (32 - shift_y) : 0;

      if (mode == MODE_PAETH) {
        // Paeth picks exactly one of left / above / top-left: the selected one gets weight 32.
        int diff_t = topleft - l0;
        int diff_l = topleft - t0;
        int diff_tl = abs(diff_t + diff_l);
        diff_t = abs(diff_t);
        diff_l = abs(diff_l);
        w_l0 = diff_l <= diff_t && diff_l <= diff_tl;
        w_t0 = diff_t < diff_l && diff_t <= diff_tl;
        w_l0 <<= 5;
        w_t0 <<= 5;
      }

      int w_tl = (mode == MODE_PAETH) ? (32 - w_t0 - w_l0) : 0;
      int w_l1 = (mode_info == Dir3) ? shift_y : 0;
      int w_t1 = (mode_info == Dir1) ? shift_x : 0;
      int w_b = (mode_info & SmoothV) ? 0x100 - w_t0 : 0;
      int w_r = (mode_info & SmoothH) ? 0x100 - w_l0 : 0;

      int sum = w_tl * topleft + w_b * left[loc_base + bh - 1] + w_r * above[loc_base + bw - 1] + w_l0 * l0 +
                w_t0 * t0 + w_l1 * l1 + w_t1 * t1;

      // params.w is the log2 of the total weight for this mode.
      sum += 1 << (params.w - 1);
      sum >>= params.w;
      int val = clamp(sum, 0, 1023) << ((i & 1) * 16);
      if (i & 2)
        pixels.y |= val;
      else
        pixels.x |= val;
    }
  }

  // Destination of this thread's four pixels.
  const int addr = corner + 4 + (x0 << 1) + (y + 1) * stride;
  if (block.y & 0x80000000)  // inter-intra
  {
    // Blend with the inter prediction already present in the frame, one mask byte per pixel.
    int wedge_addr = (block.z << 6) + x0 + y * bw;
    uint wedge = wedge_mask.Load(wedge_addr);
    uint2 inter = dst_frame.Load2(addr);

    pixels.x = intra_inter_blend((wedge >> 0) & 255, (pixels.x >> 0) & 1023, (inter.x >> 0) & 1023) |
               (intra_inter_blend((wedge >> 8) & 255, (pixels.x >> 16) & 1023, (inter.x >> 16) & 1023) << 16);
    pixels.y = intra_inter_blend((wedge >> 16) & 255, (pixels.y >> 0) & 1023, (inter.y >> 0) & 1023) |
               (intra_inter_blend((wedge >> 24) & 255, (pixels.y >> 16) & 1023, (inter.y >> 16) & 1023) << 16);
  }

  if (block.y & (1 << 5)) {
    // Non-skip block: add the signed 16-bit residuals (two per dword) and clamp to 10 bits.
    const int res_stride = cb_planes[plane].z;
    const int res_addr = cb_planes[plane].w + ((bx + x0) << 1) + (by + y) * res_stride;
    int2 res = residuals.Load2(res_addr);
    pixels.x = clamp((int)((pixels.x >> 0) & 1023) + (int)((res.x << 16) >> 16), 0, 1023) |
               (clamp((int)((pixels.x >> 16) & 1023) + (int)(res.x >> 16), 0, 1023) << 16);
    pixels.y = clamp((int)((pixels.y >> 0) & 1023) + (int)((res.y << 16) >> 16), 0, 1023) |
               (clamp((int)((pixels.y >> 16) & 1023) + (int)(res.y >> 16), 0, 1023) << 16);
  }

  dst_frame.Store2(addr, pixels);
}