blob: 44d94eae31ed1d3e1c1d83ab802ad00d64fdfb53 [file] [log] [blame] [edit]
/*
* Copyright (c) 2019, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <math.h>
#include "aom_dsp/aom_dsp_common.h"
#include "av1/common/cnn.h"
#include "av1/common/onyxc_int.h"
// Clamps index `a` into the range [0, hi - 1]: values below 0 become 0 and
// values at or above `hi` become `hi - 1`. Both arguments are expanded more
// than once, so they must be free of side effects.
#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
// Argument bundle handed to a worker thread. convolve_layer() unpacks it and
// forwards every field, in this order, to av1_cnn_convolve().
typedef struct {
// Per-channel input planes.
const float **input;
// Input plane dimensions and row stride (in floats).
int in_width;
int in_height;
int in_stride;
// Layer being evaluated.
const CNN_LAYER_CONFIG *layer_config;
// Per-channel output planes and their row stride (in floats).
float **output;
int out_stride;
// Index of this worker's first work item; convolve_layer_mt() sets it to the
// worker index.
int start_idx;
// Distance between consecutive work items of the same worker;
// convolve_layer_mt() sets it to the number of workers.
int th_step;
} CONVOLVE_OPS;
// Signature shared by all per-element activation functions.
typedef float (*activation_fn)(float);

// Softsign activation: x / (1 + |x|). The denominator is formed in double
// precision and then narrowed to float before the division.
static float softsign(float x) {
  const float denom = (float)(fabsf(x) + 1.0);
  return x / denom;
}

// Rectified linear unit: negative inputs map to 0, everything else is
// returned unchanged.
static float relu(float x) {
  if (x < 0) return 0;
  return x;
}

// Pass-through activation.
static float identity(float x) { return x; }
// A multi-channel 2-D float buffer used for intermediate layer data.
typedef struct {
// Number of floats owned through buf[0]; 0 means the planes are not owned by
// this tensor (see assign_tensor()) and free_tensor() will not release them.
int allocsize;
// Number of valid entries in buf[].
int channels;
// Plane dimensions and row stride, all in floats.
int width, height, stride;
// One pointer per channel plane. For tensors sized by realloc_tensor() the
// planes are consecutive slices of the single allocation at buf[0].
float *buf[CNN_MAX_CHANNELS];
} TENSOR;
// Puts a tensor into the empty state: every field, including all channel
// pointers and allocsize, is zeroed.
static void init_tensor(TENSOR *tensor) {
  memset(tensor, 0, sizeof(TENSOR));
}
// Releases the storage owned by a tensor. Tensors with allocsize == 0 (empty
// or wrapping caller-provided planes) are left untouched.
static void free_tensor(TENSOR *tensor) {
  if (tensor->allocsize == 0) return;
  aom_free(tensor->buf[0]);
  tensor->buf[0] = NULL;
  tensor->allocsize = 0;
}
// Resizes a tensor to channels x width x height with stride == width.
// Storage is only reallocated when the current allocation is too small; in
// that case the old contents are discarded. Channel pointers 1..channels-1
// are set to consecutive width*height slices of the buffer at buf[0].
static void realloc_tensor(TENSOR *tensor, int channels, int width,
                           int height) {
  const int required = channels * width * height;
  if (required > tensor->allocsize) {
    free_tensor(tensor);
    tensor->buf[0] = (float *)aom_malloc(sizeof(*tensor->buf[0]) * required);
    tensor->allocsize = required;
  }

  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = width;

  float *const base = tensor->buf[0];
  for (int ch = 1; ch < channels; ++ch) {
    tensor->buf[ch] = &base[ch * width * height];
  }
}
// Copies the first copy_channels planes of src into dst, starting at channel
// dst_offset of dst. Both tensors must have the same width and height. When
// neither tensor has row padding each plane is copied with a single memcpy;
// otherwise the copy proceeds row by row.
static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                        TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);
  assert(copy_channels <= src->channels);

  const int both_packed =
      (src->stride == dst->width) && (dst->stride == dst->width);

  for (int ch = 0; ch < copy_channels; ++ch) {
    if (both_packed) {
      memcpy(dst->buf[dst_offset + ch], src->buf[ch],
             sizeof(*dst->buf[0]) * src->width * src->height);
      continue;
    }
    for (int row = 0; row < dst->height; ++row) {
      float *const dst_row = &dst->buf[dst_offset + ch][row * dst->stride];
      const float *const src_row = &src->buf[ch][row * src->stride];
      memcpy(dst_row, src_row, dst->width * sizeof(*dst->buf[ch]));
    }
  }
}
// Makes a tensor refer to caller-provided planes without taking ownership:
// allocsize is set to 0 so free_tensor() will not release them. Passing a
// NULL buf sets the first `channels` plane pointers to NULL.
static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                          int channels, int width, int height, int stride) {
  tensor->allocsize = 0;
  tensor->channels = channels;
  tensor->width = width;
  tensor->height = height;
  tensor->stride = stride;
  for (int ch = 0; ch < channels; ++ch) {
    tensor->buf[ch] = buf ? buf[ch] : NULL;
  }
}
// Exchanges the complete contents (metadata and plane pointers) of two
// tensors.
static void swap_tensor(TENSOR *t1, TENSOR *t2) {
  const TENSOR saved_second = *t2;
  *t2 = *t1;
  *t1 = saved_second;
}
// Concatenates src onto dst along the channel axis: dst keeps its original
// channels first, followed by the channels of src. Both tensors must have the
// same width and height. dst's storage is grown when it cannot hold the
// combined channels; on return dst->channels is the combined channel count.
static void concat_tensor(const TENSOR *src, TENSOR *dst) {
  assert(src->width == dst->width);
  assert(src->height == dst->height);

  const int dst_channels = dst->channels;
  const int channels = dst->channels + src->channels;
  const int newallocsize = channels * dst->width * dst->height;
  if (dst->allocsize < newallocsize) {
    TENSOR t;
    init_tensor(&t);
    // Allocate new buffers and copy the existing dst channels into them.
    realloc_tensor(&t, channels, dst->width, dst->height);
    copy_tensor(dst, dst->channels, 0, &t);
    // Swap the tensors and free the old buffers.
    swap_tensor(dst, &t);
    free_tensor(&t);
  }
  // The reallocation path above already records the new channel count, but
  // when the existing allocation was large enough nothing has updated it yet.
  // Set it unconditionally so dst always reports the concatenated size.
  dst->channels = channels;
  for (int c = 1; c < channels; ++c)
    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
  // Copy the channels in src after the first dst_channels channels.
  copy_tensor(src, src->channels, dst_channels, dst);
}
// Returns 1 when both tensors have the same width and height (channel counts
// are not compared), 0 otherwise.
int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
// Returns 1 when both tensors agree in channel count, width and height,
// 0 otherwise.
int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
  if (t1->channels != t2->channels) return 0;
  if (t1->width != t2->width) return 0;
  return t1->height == t2->height;
}
// Computes the output plane size of a single layer for a given input size.
//
// Convolution:   SAME_* padding -> ceil(in / skip)
//                VALID padding  -> (in - filter + skip) / skip
// Deconvolution: SAME_* padding -> in * skip
//                VALID padding  -> (in - 1) * skip + filter
//
// For an unrecognised padding type the function asserts and leaves
// *out_width / *out_height unmodified.
static void find_layer_output_size(int in_width, int in_height,
                                   const CNN_LAYER_CONFIG *layer_config,
                                   int *out_width, int *out_height) {
  const int skip_w = layer_config->skip_width;
  const int skip_h = layer_config->skip_height;
  const int filt_w = layer_config->filter_width;
  const int filt_h = layer_config->filter_height;

  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
    case PADDING_SAME_REPLICATE:
      if (layer_config->deconvolve) {
        *out_width = in_width * skip_w;
        *out_height = in_height * skip_h;
      } else {
        *out_width = (in_width + skip_w - 1) / skip_w;
        *out_height = (in_height + skip_h - 1) / skip_h;
      }
      break;
    case PADDING_VALID:
      if (layer_config->deconvolve) {
        *out_width = (in_width - 1) * skip_w + filt_w;
        *out_height = (in_height - 1) * skip_h + filt_h;
      } else {
        *out_width = (in_width - filt_w + skip_w) / skip_w;
        *out_height = (in_height - filt_h + skip_h) / skip_h;
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}
// Advances the per-branch dimension tables by one layer.
//
// On entry i_width/i_height/i_channel describe the tensor currently held by
// each branch. The layer runs on layer_config->branch; on return that
// branch's entry holds the layer's output dimensions, and every other branch
// selected in input_to_branches holds the dimensions of the tensor copied to
// it (the layer input, the layer output, or the combined output, depending
// on branch_copy_type). For BRANCH_CAT the channel counts of the branches in
// branches_to_combine are added on top.
void find_cnn_out_dimensions(const CNN_LAYER_CONFIG *layer_config,
                             int i_width[], int i_height[], int i_channel[]) {
  const int branch = layer_config->branch;
  assert(branch >= 0 && branch < CNN_MAX_BRANCHES);
  assert(i_width[branch] > 0 && i_height[branch] > 0);

  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
  const int is_concat = (layer_config->branch_combine_type == BRANCH_CAT);

  int o_width = 0;
  int o_height = 0;
  find_layer_output_size(i_width[branch], i_height[branch], layer_config,
                         &o_width, &o_height);
  assert(o_width > 0 && o_height > 0);

  // First update the branches that receive a copy from this layer. This must
  // happen before the current branch's own entry is overwritten below,
  // because BRANCH_INPUT copies read the pre-layer dimensions.
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if (b == branch) continue;
    if (!(branch_config->input_to_branches & (1 << b))) continue;

    switch (layer_config->branch_copy_type) {
      case BRANCH_INPUT:
        i_width[b] = i_width[branch];
        i_height[b] = i_height[branch];
        i_channel[b] = layer_config->in_channels;
        break;
      case BRANCH_OUTPUT:
      case BRANCH_COMBINED:
        i_width[b] = o_width;
        i_height[b] = o_height;
        i_channel[b] = layer_config->out_channels;
        break;
      default: break;
    }

    if (is_concat && layer_config->branch_copy_type == BRANCH_COMBINED) {
      for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
        if (c == branch) continue;
        if (!(branch_config->branches_to_combine & (1 << c))) continue;
        assert(i_channel[c] > 0);
        i_channel[b] += i_channel[c];
      }
    }
  }

  // Then record the output of the layer for its own branch, including any
  // channels concatenated from other branches.
  i_width[branch] = o_width;
  i_height[branch] = o_height;
  i_channel[branch] = layer_config->out_channels;
  if (is_concat) {
    for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
      if (c == branch) continue;
      if (!(branch_config->branches_to_combine & (1 << c))) continue;
      assert(i_channel[c] > 0);
      i_channel[branch] += i_channel[c];
    }
  }
}
#if CONFIG_DEBUG
// Returns 1 if some layer of the network is tagged as output number 0,
// otherwise 0.
static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
  const CNN_LAYER_CONFIG *const layers = cnn_config->layer_config;
  const int layer_count = cnn_config->num_layers;
  int found = 0;
  for (int i = 0; i < layer_count && !found; ++i) {
    found = (layers[i].output_num == 0);
  }
  return found;
}
// Returns 1 if exactly one layer is tagged as output number 0 and no layer
// has a positive output number; returns 0 otherwise.
static INLINE int cnn_has_exactly_one_output(const CNN_CONFIG *cnn_config) {
  const CNN_LAYER_CONFIG *const layers = cnn_config->layer_config;
  const int layer_count = cnn_config->num_layers;
  int zero_tagged = 0;
  for (int i = 0; i < layer_count; ++i) {
    const int tag = layers[i].output_num;
    if (tag > 0) return 0;
    if (tag == 0) ++zero_tagged;
  }
  return zero_tagged == 1;
}
#endif
// Computes the dimensions of every output of the network for an input of
// in_width x in_height (before the ext_width / ext_height border is added).
// For each layer whose output_num is not -1, the width, height and channel
// count of that layer's result are written to index output_num of
// out_width, out_height and out_channels respectively.
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels) {
  int branch_channels[CNN_MAX_BRANCHES] = { 0 };
  int branch_width[CNN_MAX_BRANCHES] = { 0 };
  int branch_height[CNN_MAX_BRANCHES] = { 0 };

  // Branch 0 starts from the border-extended input.
  branch_width[0] = in_width + cnn_config->ext_width * 2;
  branch_height[0] = in_height + cnn_config->ext_height * 2;

#if CONFIG_DEBUG
  assert(cnn_has_at_least_one_output(cnn_config));
#endif

  const int layer_count = cnn_config->num_layers;
  for (int idx = 0; idx < layer_count; ++idx) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[idx];
    find_cnn_out_dimensions(layer_config, branch_width, branch_height,
                            branch_channels);

    const int output_num = layer_config->output_num;
    if (output_num == -1) continue;  // Not an output layer.

    const int branch = layer_config->branch;
    assert(branch >= 0 && branch < CNN_MAX_BRANCHES && output_num >= 0);
    out_width[output_num] = branch_width[branch];
    out_height[output_num] = branch_height[branch];
    out_channels[output_num] = branch_channels[branch];
  }
}
// Maps an ACTIVATION value to the function implementing it.
// Returns NULL (after asserting in debug builds) for an unrecognised value.
activation_fn get_activation(ACTIVATION layer_activation) {
  switch (layer_activation) {
    case NONE: return identity;
    case RELU: return relu;
    case SOFTSIGN: return softsign;
    default: assert(0 && "Unknown activation type"); return NULL;
  }
}
// Returns the coordinate of the first input sample at which the filter is
// centred when convolving a dimension of length `width` with step `stride`.
// The result never exceeds (filt_width - 1) / 2.
static INLINE int get_start_shift_convolve(int width, int filt_width,
                                           int stride) {
  const int remainder = width % stride;
  const int dif = (remainder != 0) ? (remainder - 1) : (stride - 1);
  const int shift = (dif + (filt_width % 2)) / 2;
  const int filt_off = (filt_width - 1) / 2;
  return AOMMIN(shift, filt_off);
}
// Element-wise accumulation: output[c] += add[c] for every channel, over the
// width x height region. `stride` is the row pitch (in floats) of both the
// output and the add planes; samples beyond `width` in a row are untouched.
void av1_cnn_add_c(float **output, int channels, int width, int height,
                   int stride, const float **add) {
  for (int ch = 0; ch < channels; ++ch) {
    for (int row = 0; row < height; ++row) {
      const int row_start = row * stride;
      for (int col = 0; col < width; ++col) {
        const int pos = row_start + col;
        output[ch][pos] += add[ch][pos];
      }
    }
  }
}
// Applies the activation selected by layer_activation, in place, to every
// sample of the width x height region of each channel plane. `stride` is the
// row pitch of the planes in floats.
void av1_cnn_activate_c(float **output, int channels, int width, int height,
                        int stride, ACTIVATION layer_activation) {
  const activation_fn apply = get_activation(layer_activation);
  for (int ch = 0; ch < channels; ++ch) {
    for (int row = 0; row < height; ++row) {
      const int row_start = row * stride;
      for (int col = 0; col < width; ++col) {
        const int pos = row_start + col;
        output[ch][pos] = apply(output[ch][pos]);
      }
    }
  }
}
// Copies the active tensor of a layer into branch_output[b] for every branch
// b, other than `branch`, that is selected in the layer's input_to_branches
// mask. The destination is resized to match. If channels_to_copy is positive
// only that many leading channels are copied; otherwise all channels are.
static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                           const CNN_LAYER_CONFIG *layer_config,
                                           int branch, TENSOR branch_output[]) {
  const CNN_BRANCH_CONFIG *const cfg = &layer_config->branch_config;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    if (b == branch) continue;
    if ((cfg->input_to_branches & (1 << b)) == 0) continue;

    int copy_channels = layer_active_tensor->channels;
    if (cfg->channels_to_copy > 0) copy_channels = cfg->channels_to_copy;

    // The copy becomes the tensor that the next layer on branch b consumes.
    TENSOR *const target = &branch_output[b];
    realloc_tensor(target, copy_channels, layer_active_tensor->width,
                   layer_active_tensor->height);
    copy_tensor(layer_active_tensor, copy_channels, 0, target);
  }
}
// Worker hook: unpacks a CONVOLVE_OPS bundle from arg1 and runs the
// convolution slice it describes. arg2 is unused. Always returns 1.
static int convolve_layer(void *arg1, void *arg2) {
  (void)arg2;
  const CONVOLVE_OPS *const ops = arg1;
  av1_cnn_convolve(ops->input, ops->in_width, ops->in_height, ops->in_stride,
                   ops->layer_config, ops->output, ops->out_stride,
                   ops->start_idx, ops->th_step);
  return 1;
}
// Runs one convolution layer split across the workers in thread_data.
//
// Worker `th` processes the work items th, th + N, th + 2N, ... where N is the
// number of workers actually used. All workers except the last are launched
// asynchronously; the last slice runs through execute(). The function returns
// only after every worker has been synced.
static void convolve_layer_mt(const float **input, int in_width, int in_height,
                              int in_stride,
                              const CNN_LAYER_CONFIG *layer_config,
                              const CNN_THREAD_DATA *thread_data,
                              float **output, int out_stride) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  // At most CNN_MAX_THREADS workers can be used because convolve_ops[] has
  // only that many slots. The clamped count must also be the stride handed to
  // each worker and the basis of the "last worker" test: using the unclamped
  // count there would leave the slices of the unused workers uncomputed.
  const int num_workers = AOMMIN(thread_data->num_workers, CNN_MAX_THREADS);
  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];

  for (int th = 0; th < num_workers; ++th) {
    AVxWorker *const worker = &thread_data->workers[th];
    winterface->reset(worker);

    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                 in_stride,  layer_config, output,
                                 out_stride, th,           num_workers };
    convolve_ops[th] = convolve_op;
    worker->hook = convolve_layer;
    worker->data1 = &(convolve_ops[th]);
    worker->data2 = NULL;

    // Start convolving.
    if (th == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all workers have finished.
  for (int th = 0; th < num_workers; ++th) {
    winterface->sync(&thread_data->workers[th]);
  }
}
// Reference C implementation of a (non-transposed) convolution layer.
//
// input/output are arrays of per-channel planes with row strides in_stride /
// out_stride (in floats). Weights are indexed as
//   weights[(filter_row * filter_width + filter_col) * cstep +
//           in_channel * out_channels + out_channel]
// with cstep = in_channels * out_channels.
//
// Three code paths exist:
//  1. Max-pooling (layer_config->maxpool set and a skip larger than 1): the
//     convolution is evaluated at every input position of each
//     skip_height x skip_width window and the maximum is stored. start_idx and
//     step are not used on this path.
//  2. 1x1 filters: start_idx/step partition the output columns; the padding
//     type is not consulted.
//  3. General filters: start_idx/step partition the output channels.
// A step smaller than 1 is treated as 1.
void av1_cnn_convolve_c(const float **input, int in_width, int in_height,
int in_stride, const CNN_LAYER_CONFIG *layer_config,
float **output, int out_stride, int start_idx,
int step) {
assert(!layer_config->deconvolve);
// Distance in weights[] between consecutive filter taps.
const int cstep = layer_config->in_channels * layer_config->out_channels;
const int filter_height_half = layer_config->filter_height >> 1;
const int filter_width_half = layer_config->filter_width >> 1;
const int channel_step = AOMMAX(step, 1);
if (layer_config->maxpool &&
(layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
// Path 1: convolution followed by max-pooling over each skip window.
// (h, w) is the top-left input position of the window, (u, v) the output
// position, and (hh, ww) iterates over the positions inside the window.
switch (layer_config->pad) {
case PADDING_SAME_ZERO:
for (int i = 0; i < layer_config->out_channels; ++i) {
for (int h = 0, u = 0; h < in_height;
h += layer_config->skip_height, ++u) {
for (int w = 0, v = 0; w < in_width;
w += layer_config->skip_width, ++v) {
for (int hh = h;
hh < AOMMIN(in_height, h + layer_config->skip_height);
++hh) {
for (int ww = w;
ww < AOMMIN(in_width, w + layer_config->skip_width);
++ww) {
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
int off = k * layer_config->out_channels + i;
for (int l = 0; l < layer_config->filter_height; ++l) {
const int ii = hh + l - filter_height_half;
for (int m = 0; m < layer_config->filter_width;
++m, off += cstep) {
const int jj = ww + m - filter_width_half;
// Taps that fall outside the image contribute zero.
if (ii < 0 || ii >= in_height || jj < 0 ||
jj >= in_width)
continue;
sum += layer_config->weights[off] *
input[k][ii * in_stride + jj];
}
}
}
const float a = sum;
// The first position of the window initialises the output; the
// remaining positions only raise it.
if (h == hh && w == ww)
output[i][u * out_stride + v] = a;
else
output[i][u * out_stride + v] =
AOMMAX(output[i][u * out_stride + v], a);
}
}
}
}
}
break;
case PADDING_SAME_REPLICATE:
for (int i = 0; i < layer_config->out_channels; ++i) {
for (int h = 0, u = 0; h < in_height;
h += layer_config->skip_height, ++u) {
for (int w = 0, v = 0; w < in_width;
w += layer_config->skip_width, ++v) {
for (int hh = h;
hh < AOMMIN(in_height, h + layer_config->skip_height);
++hh) {
for (int ww = w;
ww < AOMMIN(in_width, w + layer_config->skip_width);
++ww) {
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
int off = k * layer_config->out_channels + i;
for (int l = 0; l < layer_config->filter_height; ++l) {
// Out-of-image taps read the nearest edge sample.
const int ii =
CLAMPINDEX(hh + l - filter_height_half, in_height);
for (int m = 0; m < layer_config->filter_width;
++m, off += cstep) {
const int jj =
CLAMPINDEX(ww + m - filter_width_half, in_width);
assert(ii >= 0 && ii < in_height && jj >= 0 &&
jj < in_width);
sum += layer_config->weights[off] *
input[k][ii * in_stride + jj];
}
}
}
const float a = sum;
if (h == hh && w == ww)
output[i][u * out_stride + v] = a;
else
output[i][u * out_stride + v] =
AOMMAX(output[i][u * out_stride + v], a);
}
}
}
}
}
break;
case PADDING_VALID:
for (int i = 0; i < layer_config->out_channels; ++i) {
for (int h = 0, u = 0;
h < in_height - layer_config->filter_height + 1;
h += layer_config->skip_height, ++u) {
for (int w = 0, v = 0;
w < in_width - layer_config->filter_width + 1;
w += layer_config->skip_width, ++v) {
for (int hh = h;
hh < AOMMIN(in_height, h + layer_config->skip_height);
++hh) {
for (int ww = w;
ww < AOMMIN(in_width, w + layer_config->skip_width);
++ww) {
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
int off = k * layer_config->out_channels + i;
for (int l = 0; l < layer_config->filter_height; ++l) {
const int ii = hh + l;
for (int m = 0; m < layer_config->filter_width;
++m, off += cstep) {
const int jj = ww + m;
assert(ii >= 0 && ii < in_height && jj >= 0 &&
jj < in_width);
sum += layer_config->weights[off] *
input[k][ii * in_stride + jj];
}
}
}
const float a = sum;
if (h == hh && w == ww)
output[i][u * out_stride + v] = a;
else
output[i][u * out_stride + v] =
AOMMAX(output[i][u * out_stride + v], a);
}
}
}
}
}
break;
default: assert(0 && "Unknown padding type");
}
} else {
// Path 2: 1x1 filter. Each output sample is a weighted sum over the input
// channels at a single position. Work is split across callers by output
// column: this call handles columns start_idx, start_idx + step, ...
// Results in element-wise matrix multiplication.
if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
const int start_h = get_start_shift_convolve(
in_height, layer_config->filter_height, layer_config->skip_height);
const int start_w =
get_start_shift_convolve(in_width, layer_config->filter_width,
layer_config->skip_width) +
start_idx * layer_config->skip_width;
const int out_w_step = AOMMAX(step, 1);
const int in_w_step = layer_config->skip_width * out_w_step;
for (int i = 0; i < layer_config->out_channels; ++i) {
for (int h = start_h, u = 0; h < in_height;
h += layer_config->skip_height, ++u) {
const int in_h = h * in_stride;
const int out_h = u * out_stride + start_idx;
for (int w = start_w, out_index = out_h; w < in_width;
w += in_w_step, out_index += out_w_step) {
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
sum += layer_config->weights[k * layer_config->out_channels + i] *
input[k][in_h + w];
}
output[i][out_index] = sum;
}
}
}
return;
}
// Path 3: general filter sizes. Work is split across callers by output
// channel: this call handles channels start_idx, start_idx + channel_step,
// ...
// ii_shift / jj_shift are the number of filter taps above / to the left of
// the filter's anchor position.
const int ii_shift =
filter_height_half - (layer_config->filter_height - 1) % 2;
const int jj_shift =
filter_width_half - (layer_config->filter_width - 1) % 2;
switch (layer_config->pad) {
case PADDING_SAME_ZERO: {
const int start_h = get_start_shift_convolve(
in_height, layer_config->filter_height, layer_config->skip_height);
const int start_w = get_start_shift_convolve(
in_width, layer_config->filter_width, layer_config->skip_width);
const int end_ii_shift = filter_height_half + 1;
const int end_jj_shift = filter_width_half + 1;
// *_filter_margin stores the number of pixels along a dimension in the
// intersection of the complement of the image in the extended image
// and the filter.
const int top_filter_margin = layer_config->filter_width * ii_shift;
const int right_filter_margin = end_jj_shift - in_width;
for (int i = start_idx; i < layer_config->out_channels;
i += channel_step) {
for (int h = start_h, u = 0; h < in_height;
h += layer_config->skip_height, ++u) {
const int out_h = u * out_stride;
// Weight offset that skips the filter rows lying above the image, plus
// the output-channel index.
const int top_cstep =
AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
cstep +
i;
// Only the input rows/columns inside the image are visited; the zero
// padding is realised by skipping the matching weights.
const int start_ii = AOMMAX(0, h - ii_shift);
const int end_ii = AOMMIN(in_height, h + end_ii_shift);
for (int w = start_w, out_index = out_h; w < in_width;
w += layer_config->skip_width, ++out_index) {
// Weight offsets that skip the taps left / right of the image on each
// filter row.
const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
const int right_cstep =
AOMMAX(0, right_filter_margin + w) * cstep;
const int start_jj = AOMMAX(0, w - jj_shift);
const int end_jj = AOMMIN(in_width, w + end_jj_shift);
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
int off = k * layer_config->out_channels + top_cstep;
for (int ii = start_ii; ii < end_ii; ++ii) {
off += left_cstep;
for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
sum += layer_config->weights[off] *
input[k][ii * in_stride + jj];
}
off += right_cstep;
}
}
output[i][out_index] = sum;
}
}
}
break;
}
case PADDING_SAME_REPLICATE: {
// h and w are shifted to an offset coordinate system to reduce in-loop
// computation.
const int start_h =
get_start_shift_convolve(in_height, layer_config->filter_height,
layer_config->skip_height) -
ii_shift;
const int start_w =
get_start_shift_convolve(in_width, layer_config->filter_width,
layer_config->skip_width) -
jj_shift;
const int end_h = in_height - ii_shift;
const int end_w = in_width - jj_shift;
for (int i = start_idx; i < layer_config->out_channels;
i += channel_step) {
for (int h = start_h, u = 0; h < end_h;
h += layer_config->skip_height, ++u) {
const int out_h = u * out_stride;
const int upper_ii_index = layer_config->filter_height + h;
for (int w = start_w, out_index = out_h; w < end_w;
w += layer_config->skip_width, ++out_index) {
const int upper_jj_index = layer_config->filter_width + w;
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
int off = k * layer_config->out_channels + i;
for (int ii = h; ii < upper_ii_index; ++ii) {
// Rows/columns outside the image read the nearest edge sample.
const int clamped_ii = CLAMPINDEX(ii, in_height);
for (int jj = w; jj < upper_jj_index; ++jj) {
const int clamped_jj = CLAMPINDEX(jj, in_width);
assert(clamped_ii >= 0 && clamped_ii < in_height &&
clamped_jj >= 0 && clamped_jj < in_width);
sum += layer_config->weights[off] *
input[k][clamped_ii * in_stride + clamped_jj];
off += cstep;
}
}
}
output[i][out_index] = sum;
}
}
}
break;
}
case PADDING_VALID: {
// The filter is only placed where it fits entirely inside the image.
for (int i = start_idx; i < layer_config->out_channels;
i += channel_step) {
for (int h = 0, u = 0;
h < in_height - layer_config->filter_height + 1;
h += layer_config->skip_height, ++u) {
const int out_h = u * out_stride;
const int upper_ii_index = layer_config->filter_height + h;
for (int w = 0, out_index = out_h;
w < in_width - layer_config->filter_width + 1;
w += layer_config->skip_width, ++out_index) {
const int upper_jj_index = layer_config->filter_width + w;
float sum = layer_config->bias[i];
for (int k = 0; k < layer_config->in_channels; ++k) {
int off = k * layer_config->out_channels + i;
for (int ii = h; ii < upper_ii_index; ++ii) {
for (int jj = w; jj < upper_jj_index; ++jj) {
assert(ii >= 0 && ii < in_height && jj >= 0 &&
jj < in_width);
sum += layer_config->weights[off] *
input[k][ii * in_stride + jj];
off += cstep;
}
}
}
output[i][out_index] = sum;
}
}
}
break;
}
default: assert(0 && "Unknown padding type");
}
}
}
// Returns half of the amount by which the filter is wider than the stride,
// or 0 when the filter is not wider than the stride.
static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
  const int excess = filt_width - stride;
  return (excess > 0 ? excess : 0) / 2;
}
// Applies batch normalisation in place to every channel plane of `image`:
//
//   x <- gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch]
//
// image:   array of `channels` plane pointers.
// width, height: size of the region to normalise in each plane.
// stride:  row pitch of each plane in floats; samples beyond `width` in a row
//          are left untouched.
// gamma, beta, mean, std: per-channel parameters, each with `channels`
//          entries. All four must be non-NULL.
void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                         int stride, const float *gamma, const float *beta,
                         const float *mean, const float *std) {
  // All four parameter arrays are dereferenced below, so check each of them.
  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
  for (int ch = 0; ch < channels; ch++) {
    const float ch_gamma = gamma[ch];
    const float ch_beta = beta[ch];
    const float ch_mean = mean[ch];
    const float ch_std = std[ch];
    float *image_row = image[ch];

    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        image_row[col] =
            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
      }
      image_row += stride;
    }
  }
}
// Reference C implementation of a transposed convolution (deconvolution)
// layer.
//
// input/output are arrays of per-channel planes with row strides in_stride /
// out_stride (in floats). The output size is obtained from
// find_layer_output_size(). For every output sample (u, v) the function
// visits every filter tap (l, m), maps it back to an up-sampled input
// coordinate (h, w), and accumulates the tap only when (h, w) lands exactly
// on an input sample, i.e. is a multiple of the skip in both directions.
//
// Padding behaviour:
//  - PADDING_SAME_ZERO:      taps mapping outside the input contribute 0.
//  - PADDING_SAME_REPLICATE: taps mapping outside the input read the nearest
//                            edge sample.
//  - PADDING_VALID:          no start shift; out-of-range taps contribute 0.
void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
                          float **output, int out_stride) {
  assert(layer_config->deconvolve);

  // Distance in weights[] between consecutive filter taps.
  const int cstep = layer_config->in_channels * layer_config->out_channels;

  int out_width = 0;
  int out_height = 0;
  find_layer_output_size(in_width, in_height, layer_config, &out_width,
                         &out_height);

  // Offsets used by the two SAME padding modes; they depend only on the layer
  // configuration.
  const int shift_h = get_start_shift_deconvolve(layer_config->filter_height,
                                                 layer_config->skip_height);
  const int shift_w = get_start_shift_deconvolve(layer_config->filter_width,
                                                 layer_config->skip_width);

  switch (layer_config->pad) {
    case PADDING_SAME_ZERO:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l + shift_h;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m + shift_w;
                  // Skip positions that fall between input samples.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_SAME_REPLICATE:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l + shift_h;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m + shift_w;
                  // Skip positions that fall between input samples.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  // Out-of-range coordinates read the nearest edge sample.
                  const int ii =
                      CLAMPINDEX(h / layer_config->skip_height, in_height);
                  const int jj =
                      CLAMPINDEX(w / layer_config->skip_width, in_width);
                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                  // Accumulate the tap. An unconditional `continue` used to
                  // sit here, which made this statement unreachable and
                  // reduced every output sample to the bias alone.
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    case PADDING_VALID:
      for (int i = 0; i < layer_config->out_channels; ++i) {
        for (int u = 0; u < out_height; ++u) {
          for (int v = 0; v < out_width; ++v) {
            float sum = layer_config->bias[i];
            for (int k = 0; k < layer_config->in_channels; ++k) {
              int off = k * layer_config->out_channels + i;
              for (int l = 0; l < layer_config->filter_height; ++l) {
                const int h = u - l;
                for (int m = 0; m < layer_config->filter_width;
                     ++m, off += cstep) {
                  const int w = v - m;
                  // Skip positions that fall between input samples.
                  if ((h % layer_config->skip_height) != 0 ||
                      (w % layer_config->skip_width) != 0)
                    continue;
                  const int ii = h / layer_config->skip_height;
                  const int jj = w / layer_config->skip_width;
                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                    continue;
                  sum += layer_config->weights[off] *
                         input[k][ii * in_stride + jj];
                }
              }
            }
            output[i][u * out_stride + v] = sum;
          }
        }
      }
      break;
    default: assert(0 && "Unknown padding type");
  }
}
// Runs the whole network described by cnn_config on `input` and writes every
// output layer's result into output_struct.
//
// input:         per-channel planes of size in_width x in_height with row
//                pitch in_stride; consumed by the first layer, which must be
//                on branch 0.
// thread_data:   when num_workers > 1, convolution layers are split across
//                the workers; deconvolution layers always run on the caller.
// output_struct: output_buffer is a flat array of plane pointers; output n
//                uses output_channels[n] consecutive entries and row pitch
//                output_strides[n]. The planes must already be allocated.
//
// Each branch keeps two tensors: tensor1 is the input of the layer being
// evaluated, tensor2 receives its output. They are swapped before every
// non-first layer, so a layer's output becomes the next layer's input on the
// same branch.
void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                       int in_stride, const CNN_CONFIG *cnn_config,
                       const CNN_THREAD_DATA *thread_data,
                       CNN_MULTI_OUT *output_struct) {
  TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 };
  TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 };

  // output[n] points at the first plane pointer belonging to output n.
  float **output[CNN_MAX_BRANCHES];
  const int *out_chs = output_struct->output_channels;
  output[0] = output_struct->output_buffer;
  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
  }

  int i_width = in_width;
  int i_height = in_height;
  int o_width = 0, o_height = 0;
  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    init_tensor(&tensor1[b]);
    init_tensor(&tensor2[b]);
  }

  const int *out_stride = output_struct->output_strides;
  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
    const int branch = layer_config->branch;
    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

    // Allocate input tensor
    if (layer == 0) {       // First layer
      assert(branch == 0);  // First layer must be primary branch
      assign_tensor(&tensor1[branch], (float **)input,
                    layer_config->in_channels, in_width, in_height, in_stride);
    } else {  // Non-first layer
      // Swap tensor1 and tensor2
      swap_tensor(&tensor1[branch], &tensor2[branch]);

      i_width = tensor1[branch].width;
      i_height = tensor1[branch].height;
    }

    // Allocate output tensor
    find_layer_output_size(i_width, i_height, layer_config, &o_width,
                           &o_height);
    const int output_num = layer_config->output_num;
    if (output_num == -1) {  // Non-output layer
      realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                     o_height);
    } else {  // Output layer
      // Write straight into the caller's output planes.
      free_tensor(&tensor2[branch]);
      assign_tensor(&tensor2[branch], output[output_num],
                    layer_config->out_channels, o_width, o_height,
                    out_stride[output_num]);
    }

    // If we are combining branches make sure that the branch to combine
    // is different from the current branch.
    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                   !(branch_config->branches_to_combine & (1 << branch))));

    if (layer_config->branch_copy_type == BRANCH_INPUT) {
      copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                     tensor2);
    }
    // Check consistency of input and output channels
    assert(tensor1[branch].channels == layer_config->in_channels);
    assert(tensor2[branch].channels == layer_config->out_channels);

    // Convolve/Deconvolve
    if (!cnn_config->layer_config[layer].deconvolve) {
      if (thread_data->num_workers > 1) {
        convolve_layer_mt((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config, thread_data,
                          tensor2[branch].buf, tensor2[branch].stride);
      } else {
        av1_cnn_convolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
      }
    } else {
      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                         tensor1[branch].width, tensor1[branch].height,
                         tensor1[branch].stride, layer_config,
                         tensor2[branch].buf, tensor2[branch].stride);
    }

    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }

    // Add tensors from other branches if needed
    if (layer_config->branch_combine_type == BRANCH_ADD) {
      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                      tensor2[branch].width, tensor2[branch].height,
                      tensor2[branch].stride, (const float **)tensor2[b].buf);
        }
      }
    }

    // Non-linearity. NONE is the ACTIVATION value that get_activation() maps
    // to the identity function, so there is nothing to apply for it.
    if (layer_config->activation != NONE)
      av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, layer_config->activation);

    // Batch normalisation is enabled by providing a gamma array.
    if (layer_config->bn_params.bn_gamma) {
      av1_cnn_batchnorm(
          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
          tensor2[branch].height, tensor2[branch].stride,
          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
    }

    // Concatenate tensors
    if (layer_config->branch_combine_type == BRANCH_CAT) {
      if (output_num == -1) {  // Non-output layer
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            assert(tensor2[b].channels > 0);
            concat_tensor(&tensor2[b], &tensor2[branch]);
          }
        }
      } else {  // Output layer
        const int existing_channels = tensor2[branch].channels;
        int num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Needed only to assign the new channel buffers
            num_chs += tensor2[b].channels;
          }
        }
        // Re-point the tensor at the caller's planes, now covering the
        // concatenated channel count.
        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                      o_height, out_stride[output_num]);

        num_chs = existing_channels;
        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
            // Append this branch's channels after the ones already written.
            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                        &tensor2[branch]);
            num_chs += tensor2[b].channels;
          }
        }
      }
    }

    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
      copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                     tensor2);
    }
  }

  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
    free_tensor(&tensor1[b]);
    free_tensor(&tensor2[b]);
  }
}
// Runs the network on an 8-bit image.
//
// The image planes in dgd are converted to floats in [0, 1] (divided by 255),
// placed in a temporary buffer extended by ext_width / ext_height samples on
// each side, and passed to av1_cnn_predict().
//
// When cnn_config->strict_bounds is set, only the width x height region of
// dgd is read and the border is filled by replicating the edge samples.
// Otherwise the border samples are read directly from dgd, so dgd must be
// readable ext_height rows / ext_width columns beyond the image on every
// side.
//
// Assumes output already has proper allocation.
// Assumes input image buffers all have the same resolution and strides.
// If the temporary buffer cannot be allocated the function returns without
// touching output.
void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   CNN_MULTI_OUT *output) {
  const float max_val = 255.0;

  const int in_width = width + 2 * cnn_config->ext_width;
  const int in_height = height + 2 * cnn_config->ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
  // Every path below writes through input_, so bail out on allocation
  // failure instead of dereferencing NULL.
  if (input_ == NULL) return;
  const int in_stride = in_width;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + c * in_stride * in_height;
    // `input` addresses the top-left sample of the un-extended image inside
    // the extended plane, so negative offsets reach into the border.
    float *input =
        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i)
        for (int j = 0; j < width; ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -cnn_config->ext_width; j < 0; ++j)
          input[i * in_stride + j] = input[i * in_stride];
        for (int j = width; j < width + cnn_config->ext_width; ++j)
          input[i * in_stride + j] = input[i * in_stride + width - 1];
      }
      // extend top and bottom
      for (int i = -cnn_config->ext_height; i < 0; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[-cnn_config->ext_width], in_width * sizeof(*input));
      for (int i = height; i < height + cnn_config->ext_height; ++i)
        memcpy(&input[i * in_stride - cnn_config->ext_width],
               &input[(height - 1) * in_stride - cnn_config->ext_width],
               in_width * sizeof(*input));
    } else {
      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
           ++i)
        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
             ++j)
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
    }
  }
  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);

  aom_free(input_);
}
// Runs the CNN on high-bit-depth (16-bit storage) image planes.
//
// Each of the first `layer_config[0].in_channels` planes in `dgd` is converted
// to float, scaled by 1 / ((1 << bit_depth) - 1), and stored in a temporary
// buffer that is `ext_width` / `ext_height` samples larger than the image on
// every side. The padded planes are then handed to av1_cnn_predict().
//
// When `cnn_config->strict_bounds` is set, only the `width` x `height` region
// of each plane is read and the border is filled by replicating the edge
// samples. Otherwise the border is read directly from `dgd`, so the caller
// must provide at least `ext_width` / `ext_height` readable samples around
// each plane.
//
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
//
// If the temporary input buffer cannot be allocated, the function returns
// without running the network and without touching `output`.
void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                          int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth,
                                          CNN_MULTI_OUT *output) {
  const float max_val = (float)((1 << bit_depth) - 1);
  const int ext_width = cnn_config->ext_width;
  const int ext_height = cnn_config->ext_height;
  const int in_width = width + 2 * ext_width;
  const int in_height = height + 2 * ext_height;
  const int in_channels = cnn_config->layer_config[0].in_channels;
  const int in_stride = in_width;
  // `inputs` holds one pointer per channel; guard against overrunning it.
  assert(in_channels <= CNN_MAX_CHANNELS);

  // Compute sizes in size_t so that large frames cannot overflow `int`.
  const size_t plane_size = (size_t)in_width * (size_t)in_height;
  float *inputs[CNN_MAX_CHANNELS];
  float *input_ =
      (float *)aom_malloc(plane_size * (size_t)in_channels * sizeof(*input_));
  if (input_ == NULL) return;

  for (int c = 0; c < in_channels; ++c) {
    inputs[c] = input_ + (size_t)c * plane_size;
    // `input` points at the top-left sample of the unpadded image area.
    float *input = inputs[c] + ext_height * in_stride + ext_width;

    if (cnn_config->strict_bounds) {
      for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; ++j) {
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
        }
      }
      // extend left and right
      for (int i = 0; i < height; ++i) {
        for (int j = -ext_width; j < 0; ++j) {
          input[i * in_stride + j] = input[i * in_stride];
        }
        for (int j = width; j < width + ext_width; ++j) {
          input[i * in_stride + j] = input[i * in_stride + width - 1];
        }
      }
      // extend top and bottom (rows already include the left/right border)
      for (int i = -ext_height; i < 0; ++i) {
        memcpy(&input[i * in_stride - ext_width], &input[-ext_width],
               (size_t)in_width * sizeof(*input));
      }
      for (int i = height; i < height + ext_height; ++i) {
        memcpy(&input[i * in_stride - ext_width],
               &input[(height - 1) * in_stride - ext_width],
               (size_t)in_width * sizeof(*input));
      }
    } else {
      // The border is taken from the source image itself.
      for (int i = -ext_height; i < height + ext_height; ++i) {
        for (int j = -ext_width; j < width + ext_width; ++j) {
          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
        }
      }
    }
  }

  av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                  cnn_config, thread_data, output);
  aom_free(input_);
}
// Convenience wrapper around av1_cnn_predict_img_multi_out() for 8-bit input.
//
// Builds a CNN_MULTI_OUT that carries one channel count (obtained from
// av1_find_cnn_output_size()), one stride (`out_stride`) and the caller's
// `output` plane pointers, then forwards everything to the multi-output
// entry point.
//
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data, float **output,
                         int out_stride) {
  // Only the channel count is used below; width/height are required by the
  // query function's signature.
  int cnn_out_w = 0;
  int cnn_out_h = 0;
  int cnn_out_ch = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &cnn_out_w, &cnn_out_h,
                           &cnn_out_ch);

  // One-element "arrays" describing the output.
  const int channel_count = cnn_out_ch;
  const int stride_value = out_stride;
  CNN_MULTI_OUT single_out = { .output_buffer = output,
                               .output_strides = &stride_value,
                               .output_channels = &channel_count };

  av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
                                thread_data, &single_out);
}
// Convenience wrapper around av1_cnn_predict_img_multi_out_highbd() for
// high-bit-depth input.
//
// Builds a CNN_MULTI_OUT that carries one channel count (obtained from
// av1_find_cnn_output_size()), one stride (`out_stride`) and the caller's
// `output` plane pointers, then forwards everything, including `bit_depth`,
// to the multi-output entry point.
//
// Assume output already has proper allocation
// Assume input image buffers all have same resolution and strides
void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth, float **output, int out_stride) {
  // Only the channel count is used below; width/height are required by the
  // query function's signature.
  int cnn_out_w = 0;
  int cnn_out_h = 0;
  int cnn_out_ch = 0;
  av1_find_cnn_output_size(width, height, cnn_config, &cnn_out_w, &cnn_out_h,
                           &cnn_out_ch);

  // One-element "arrays" describing the output.
  const int channel_count = cnn_out_ch;
  const int stride_value = out_stride;
  CNN_MULTI_OUT single_out = { .output_buffer = output,
                               .output_strides = &stride_value,
                               .output_channels = &channel_count };

  av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
                                       thread_data, bit_depth, &single_out);
}
// Applies a single-channel restoration CNN to an 8-bit image plane in place.
//
// The network is run on the `width` x `height` region at `dgd` (row pitch
// `stride`). Its float output is scaled by 255 and either added to the
// existing samples (`cnn_config->is_residue` set) or used to replace them;
// results are clipped with clip_pixel().
//
// The network is expected (asserted) to produce one channel with the same
// dimensions as the input.
//
// If the temporary output buffer cannot be allocated, the function returns
// and leaves `dgd` unmodified.
void av1_restore_cnn_img(uint8_t *dgd, int width, int height, int stride,
                         const CNN_CONFIG *cnn_config,
                         const CNN_THREAD_DATA *thread_data) {
  const float max_val = 255;
  int out_width = 0;
  int out_height = 0;
  int out_channels = 0;
#if CONFIG_DEBUG
  assert(cnn_has_exactly_one_output(cnn_config));
#endif
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  assert(out_width == width);
  assert(out_height == height);
  // For restoration, we only want one channel outputted.
  assert(out_channels == 1);

  const int out_stride = width;
  // Size computed in size_t so that large planes cannot overflow `int`.
  float *output =
      (float *)aom_malloc((size_t)width * (size_t)height * sizeof(*output));
  if (output == NULL) return;

  av1_cnn_predict_img(&dgd, width, height, stride, cnn_config, thread_data,
                      &output, out_stride);

  // NOTE(review): `(int)(x + 0.5)` truncates toward zero, so negative values
  // are not rounded to nearest. Left unchanged; confirm whether intended.
  if (cnn_config->is_residue) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        const int residue = (int)(output[i * out_stride + j] * max_val + 0.5);
        dgd[i * stride + j] = clip_pixel(dgd[i * stride + j] + residue);
      }
    }
  } else {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        dgd[i * stride + j] =
            clip_pixel((int)(output[i * out_stride + j] * max_val + 0.5));
      }
    }
  }
  aom_free(output);
}
// Applies a single-channel restoration CNN to a high-bit-depth image plane in
// place.
//
// The network is run on the `width` x `height` region at `dgd` (row pitch
// `stride`, in samples). Its float output is scaled by (1 << bit_depth) - 1
// and either added to the existing samples (`cnn_config->is_residue` set) or
// used to replace them; results are clipped with clip_pixel_highbd().
//
// The network is expected (asserted) to produce one channel with the same
// dimensions as the input.
//
// If the temporary output buffer cannot be allocated, the function returns
// and leaves `dgd` unmodified.
void av1_restore_cnn_img_highbd(uint16_t *dgd, int width, int height,
                                int stride, const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data,
                                int bit_depth) {
  const float max_val = (float)((1 << bit_depth) - 1);
  int out_width = 0;
  int out_height = 0;
  int out_channels = 0;
#if CONFIG_DEBUG
  assert(cnn_has_exactly_one_output(cnn_config));
#endif
  av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                           &out_channels);
  assert(out_width == width);
  assert(out_height == height);
  // For restoration, we only want one channel outputted.
  assert(out_channels == 1);

  // Size computed in size_t so that large planes cannot overflow `int`.
  float *output =
      (float *)aom_malloc((size_t)width * (size_t)height * sizeof(*output));
  if (output == NULL) return;

  const int out_stride = width;
  av1_cnn_predict_img_highbd(&dgd, width, height, stride, cnn_config,
                             thread_data, bit_depth, &output, out_stride);

  // NOTE(review): `(int)(x + 0.5)` truncates toward zero, so negative values
  // are not rounded to nearest. Left unchanged; confirm whether intended.
  if (cnn_config->is_residue) {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        const int residue = (int)(output[i * out_stride + j] * max_val + 0.5);
        // Store the clipped sum. The clipped value already contains the old
        // sample, so accumulating with `+=` would add it a second time and
        // could wrap the 16-bit storage.
        dgd[i * stride + j] =
            clip_pixel_highbd(dgd[i * stride + j] + residue, bit_depth);
      }
    }
  } else {
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) {
        dgd[i * stride + j] = clip_pixel_highbd(
            (int)(output[i * out_stride + j] * max_val + 0.5), bit_depth);
      }
    }
  }
  aom_free(output);
}
// Runs CNN restoration in place on a rectangular part of one plane of the
// current frame (`cm->cur_frame->buf`).
//
// `start_x`, `start_y`, `width` and `height` are given in luma samples and
// are asserted to lie inside the luma crop area. For the U and V planes the
// position and size are shifted right by the buffer's subsampling factors.
// The high-bit-depth or 8-bit restoration routine is chosen from
// `cm->seq_params.use_highbitdepth`.
void av1_restore_cnn_plane_part(const AV1_COMMON *cm,
                                const CNN_CONFIG *cnn_config,
                                const CNN_THREAD_DATA *thread_data, int plane,
                                int start_x, int start_y, int width,
                                int height) {
  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
  assert(start_x >= 0 && start_x + width <= frame->y_crop_width);
  assert(start_y >= 0 && start_y + height <= frame->y_crop_height);

  // Resolve everything that depends on the plane in one place.
  uint8_t *plane_base = NULL;
  int plane_stride = 0;
  int region_w = 0;
  int region_h = 0;
  int region_offset = 0;
  switch (plane) {
    case AOM_PLANE_Y:
      plane_base = frame->y_buffer;
      plane_stride = frame->y_stride;
      region_w = width;
      region_h = height;
      region_offset = start_y * frame->y_stride + start_x;
      break;
    case AOM_PLANE_U:
    case AOM_PLANE_V:
      plane_base = (plane == AOM_PLANE_U) ? frame->u_buffer : frame->v_buffer;
      plane_stride = frame->uv_stride;
      region_w = width >> frame->subsampling_x;
      region_h = height >> frame->subsampling_y;
      region_offset = (start_y >> frame->subsampling_y) * frame->uv_stride +
                      (start_x >> frame->subsampling_x);
      break;
    default:
      assert(0 && "Invalid plane index");
      // Release builds: an unknown plane is ignored.
      return;
  }

  if (cm->seq_params.use_highbitdepth) {
    av1_restore_cnn_img_highbd(CONVERT_TO_SHORTPTR(plane_base + region_offset),
                               region_w, region_h, plane_stride, cnn_config,
                               thread_data, cm->seq_params.bit_depth);
  } else {
    assert(cm->seq_params.bit_depth == 8);
    av1_restore_cnn_img(plane_base + region_offset, region_w, region_h,
                        plane_stride, cnn_config, thread_data);
  }
}
// Runs CNN restoration in place on one whole plane of the current frame
// (`cm->cur_frame->buf`), using the plane's crop width/height and stride.
// The high-bit-depth or 8-bit restoration routine is chosen from
// `cm->seq_params.use_highbitdepth`.
void av1_restore_cnn_plane(const AV1_COMMON *cm, const CNN_CONFIG *cnn_config,
                           int plane, const CNN_THREAD_DATA *thread_data) {
  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
  // The 8-bit path is only valid for 8-bit sequences.
  assert(use_highbd || cm->seq_params.bit_depth == 8);

  // Resolve everything that depends on the plane in one place.
  uint8_t *plane_base = NULL;
  int plane_w = 0;
  int plane_h = 0;
  int plane_stride = 0;
  switch (plane) {
    case AOM_PLANE_Y:
      plane_base = frame->y_buffer;
      plane_w = frame->y_crop_width;
      plane_h = frame->y_crop_height;
      plane_stride = frame->y_stride;
      break;
    case AOM_PLANE_U:
    case AOM_PLANE_V:
      plane_base = (plane == AOM_PLANE_U) ? frame->u_buffer : frame->v_buffer;
      plane_w = frame->uv_crop_width;
      plane_h = frame->uv_crop_height;
      plane_stride = frame->uv_stride;
      break;
    default:
      assert(0 && "Invalid plane index");
      // Release builds: an unknown plane is ignored.
      return;
  }

  if (use_highbd) {
    av1_restore_cnn_img_highbd(CONVERT_TO_SHORTPTR(plane_base), plane_w,
                               plane_h, plane_stride, cnn_config, thread_data,
                               cm->seq_params.bit_depth);
  } else {
    av1_restore_cnn_img(plane_base, plane_w, plane_h, plane_stride, cnn_config,
                        thread_data);
  }
}