// blob: efbf794609b66b22dbf2eae0d9f8259286b3fe84 [file] [log] [blame]
/*
* Copyright 2020 Google LLC
*
*/
/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "inter_common.h"
// Size, in samples, of the subblock handled by one thread; main() multiplies
// block/subblock coordinates by these to get sample coordinates.
#define SubblockW 4
#define SubblockH 4
// Right shift applied to the accumulated vertical-filter sum when producing
// the final sample value.
#define OutputShift 11
// Bit position of the constant offset that is folded into the rounding term
// below and removed again (after the shift) by OutputSub.
#define OffsetBits 21
// Value added before the shift: half of (1 << OutputShift) for rounding, plus
// the (1 << OffsetBits) offset.
#define OutputRoundAdd ((1 << (OutputShift - 1)) + (1 << OffsetBits))
// Value subtracted after the shift: the shifted-down offset plus a further
// half of it.
// NOTE(review): the extra half term presumably cancels an offset introduced
// by the horizontal stage (filter_line_hbd) -- confirm against inter_common.h.
#define OutputSub ((1 << (OffsetBits - OutputShift)) + (1 << (OffsetBits - OutputShift - 1)))
// Upper clamp bound for output samples (2^10 - 1; presumably 10-bit content --
// confirm).
#define PixelMax 1023
// Compute-shader entry point. Each thread reconstructs one SubblockW x
// SubblockH (4x4) group of 16-bit samples:
//   1. decode the block descriptor and motion vector from pred_blocks,
//   2. fetch 11 horizontally filtered reference rows via filter_line_hbd and
//      combine them with the 8-tap vertical kernel into 4 output rows,
//   3. round, clamp to [0, PixelMax], optionally add the residual, and store
//      the samples packed two per 32-bit word into dst_frame.
[numthreads(64, 1, 1)] void main(uint3 thread : SV_DispatchThreadID) {
  // Threads beyond the work-item count have nothing to do.
  if (thread.x >= cb_wi_count) return;

  // The low (log2_w + log2_h) bits of the thread id pick the subblock inside
  // a block; the remaining bits pick the block within this pass.
  const int log2_w = cb_width_log2;
  const int log2_h = cb_height_log2;
  const int log2_area = log2_w + log2_h;
  const int sub_index = thread.x & ((1 << log2_area) - 1);
  const int block_id = cb_pass_offset + (thread.x >> log2_area);

  // Block descriptor, 16 bytes per block:
  //   desc.x - block position: x in the low 16 bits, y in the high 16 bits
  //   desc.y - flags, from bit 0 upwards:
  //              2 bits plane
  //              3 bits reference index
  //              4 bits horizontal filter
  //              4 bits vertical filter
  //              1 bit  skip
  //              1 bit  compound?
  //   desc.z - motion vector: horizontal part in the high 16 bits, vertical
  //            part in the low 16 bits
  const uint4 desc = pred_blocks.Load4(block_id << 4);

  // Top-left sample coordinate of this thread's subblock.
  const int sub_x = sub_index & ((1 << log2_w) - 1);
  const int sub_y = sub_index >> log2_w;
  const int x = SubblockW * ((desc.x & 0xffff) + sub_x);
  const int y = SubblockH * ((desc.x >> 16) + sub_y);

  const int plane = desc.y & 3;
  // cb_dims[0] is used for plane 0, cb_dims[1] for every other plane.
  const int2 dims = cb_dims[plane > 0 ? 1 : 0].xy;
  const int noskip = desc.y & NoSkipFlag;

  // Integer part of the motion vector selects the reference position; the -3
  // moves to the first tap of the 8-tap window. The shifts are arithmetic so
  // the sign of each 16-bit half is preserved.
  const int mv = desc.z;
  const int ref_y = y + ((mv << 16) >> (16 + SUBPEL_BITS)) - 3;
  // Horizontal position is clamped and doubled (two bytes per sample).
  const int ref_x_bytes = clamp(x + (mv >> (16 + SUBPEL_BITS)) - 3, -11, dims.x) << 1;

  // Kernel table row = 16 * filter type + sub-pixel fraction.
  const int frac_x = (mv >> 16) & SUBPEL_MASK;
  const int frac_y = mv & SUBPEL_MASK;
  const int filter_h = (((desc.y >> 5) & 15) << 4) + frac_x;
  const int filter_v = (((desc.y >> 9) & 15) << 4) + frac_y;

  // Reference planes are laid out as 3 planes per reference index.
  const int ref_plane = ((desc.y >> 2) & 7) * 3 + plane;
  const int ref_stride = cb_refplanes[ref_plane].x;
  // Address of column ref_x in row 0 of the reference plane.
  const int ref_base = cb_refplanes[ref_plane].y + ref_x_bytes;

  // Horizontal kernel (passed to the helper) and vertical kernel taps.
  int4 kh_lo = cb_kernels[filter_h][0];
  int4 kh_hi = cb_kernels[filter_h][1];
  const int4 kv_lo = cb_kernels[filter_v][0];
  const int4 kv_hi = cb_kernels[filter_v][1];
  const int t0 = kv_lo.x;
  const int t1 = kv_lo.y;
  const int t2 = kv_lo.z;
  const int t3 = kv_lo.w;
  const int t4 = kv_hi.x;
  const int t5 = kv_hi.y;
  const int t6 = kv_hi.z;
  const int t7 = kv_hi.w;

  // Vertical pass. Output row r is the sum over k of (source row r + k) * tap
  // k, so 4 output rows need 11 source rows. Each source row is fetched once
  // (row index clamped to [0, dims.y]) and scattered into every output row
  // that uses it.
  int4 acc[4] = {int4(0, 0, 0, 0), int4(0, 0, 0, 0), int4(0, 0, 0, 0), int4(0, 0, 0, 0)};
  int4 h;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 0, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t0;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 1, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t1;
  acc[1] += h * t0;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 2, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t2;
  acc[1] += h * t1;
  acc[2] += h * t0;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 3, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t3;
  acc[1] += h * t2;
  acc[2] += h * t1;
  acc[3] += h * t0;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 4, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t4;
  acc[1] += h * t3;
  acc[2] += h * t2;
  acc[3] += h * t1;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 5, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t5;
  acc[1] += h * t4;
  acc[2] += h * t3;
  acc[3] += h * t2;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 6, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t6;
  acc[1] += h * t5;
  acc[2] += h * t4;
  acc[3] += h * t3;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 7, 0, dims.y), kh_lo, kh_hi);
  acc[0] += h * t7;
  acc[1] += h * t6;
  acc[2] += h * t5;
  acc[3] += h * t4;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 8, 0, dims.y), kh_lo, kh_hi);
  acc[1] += h * t7;
  acc[2] += h * t6;
  acc[3] += h * t5;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 9, 0, dims.y), kh_lo, kh_hi);
  acc[2] += h * t7;
  acc[3] += h * t6;

  h = filter_line_hbd(dst_frame, ref_base + ref_stride * clamp(ref_y + 10, 0, dims.y), kh_lo, kh_hi);
  acc[3] += h * t7;

  // Destination and residual addresses; x is doubled because both buffers
  // hold 16-bit values.
  const int x_bytes = x << 1;
  const int dst_stride = cb_planes[plane].x;
  const int dst_base = cb_planes[plane].y + x_bytes + y * dst_stride;
  const int res_stride = cb_planes[plane].z;
  const int res_base = cb_planes[plane].w + x_bytes + y * res_stride;

  for (int r = 0; r < 4; ++r) {
    // Round, remove the filter offset and clamp the prediction to range.
    int4 px = clamp(((acc[r] + OutputRoundAdd) >> OutputShift) - OutputSub, 0, PixelMax);

    if (noskip != 0) {
      // Four signed 16-bit residuals packed into two 32-bit words; the shift
      // pairs sign-extend each half.
      const int2 res_words = (int2)residuals.Load2(res_base + r * res_stride);
      const int4 res = int4((res_words.x << 16) >> 16, res_words.x >> 16,
                            (res_words.y << 16) >> 16, res_words.y >> 16);
      px = clamp(px + res, 0, PixelMax);
    }

    // Pack four 16-bit samples into two 32-bit words and write the row.
    dst_frame.Store2(dst_base + r * dst_stride,
                     uint2(px.x | (px.y << 16), px.z | (px.w << 16)));
  }
}