av1/encoder/cnn.c - aom - Git at Google

 /*
  * Copyright (c) 2019, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include <assert.h>
 #include <math.h>

 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/encoder/cnn.h"
 #include "av1/common/av1_common_int.h"

 // Clamps the index `a` into the range [0, hi - 1]: values below 0 become 0 and
 // values >= hi become hi - 1.
 // NOTE: this is a function-like macro and each argument may be evaluated more
 // than once, so do not pass expressions with side effects.
 #define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))

 // Argument bundle for one convolution work item. convolve_layer() unpacks it
 // and forwards the fields, in this order, to av1_cnn_convolve().
 // NOTE: convolve_layer_mt() fills this struct with a positional initializer,
 // so the field order must not change.
 typedef struct {
   const float **input;                   // Per-channel input planes.
   int in_width;                          // Input plane width.
   int in_height;                         // Input plane height.
   int in_stride;                         // Row stride of the input planes.
   const CNN_LAYER_CONFIG *layer_config;  // Layer being evaluated.
   float **output;                        // Per-channel output planes.
   int out_stride;                        // Row stride of the output planes.
   int start_idx;  // First slice handled by this work item (worker index).
   int th_step;    // Distance between slices of one work item (worker count).
 } CONVOLVE_OPS;

 // Pointer to a scalar activation function: maps one float to one float.
 typedef float (*activation_fn)(float);

 // Softsign activation: x / (|x| + 1). The denominator is formed in double
 // precision and narrowed to float before the division.
 static float softsign(float x) {
   const float denom = (float)(fabsf(x) + 1.0);
   return x / denom;
 }

 // Rectified linear unit: negative inputs map to 0, everything else is
 // returned unchanged.
 static float relu(float x) {
   if (x < 0) return 0;
   return x;
 }

 // Identity activation: returns its argument unchanged.
 static float identity(float x) {
   const float unchanged = x;
   return unchanged;
 }

 // A multi-channel 2-D float buffer.
 typedef struct {
   // Number of floats owned through buf[0]. Zero means the tensor owns no
   // memory (free_tensor() then does nothing), e.g. after assign_tensor().
   int allocsize;
   int channels;  // Number of valid entries in buf[].
   // Plane dimensions and row stride, in floats.
   int width, height, stride;
   // One pointer per channel. When allocated by realloc_tensor(), buf[c] points
   // into the single allocation that starts at buf[0].
   float *buf[CNN_MAX_CHANNELS];
 } TENSOR;

 // Zero-fills the whole TENSOR structure so that it owns no memory
 // (allocsize == 0) and describes an empty tensor.
 static void init_tensor(TENSOR *tensor) {
   memset(tensor, 0, sizeof(TENSOR));
 }

 // Releases the memory owned by `tensor`, if any. A tensor with allocsize == 0
 // owns nothing and is left untouched; otherwise the allocation that starts at
 // buf[0] is freed and the ownership fields are reset.
 static void free_tensor(TENSOR *tensor) {
   if (tensor->allocsize == 0) return;
   aom_free(tensor->buf[0]);
   tensor->buf[0] = NULL;
   tensor->allocsize = 0;
 }

 // Ensures `tensor` owns room for channels * width * height floats and sets it
 // up as a densely packed tensor (stride == width) with that shape.
 //
 // The existing allocation is reused when it is already large enough; its
 // contents are NOT preserved when a new buffer has to be allocated.
 //
 // On allocation failure the tensor is left owning nothing (allocsize == 0,
 // buf[0] == NULL) and its shape fields are not updated, so callers can detect
 // the failure by checking buf[0].
 static void realloc_tensor(TENSOR *tensor, int channels, int width,
                            int height) {
   const int newallocsize = channels * width * height;
   if (tensor->allocsize < newallocsize) {
     free_tensor(tensor);
     tensor->buf[0] =
         (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
     if (tensor->buf[0] == NULL) {
       // Do not record a size for memory we do not have, and do not derive
       // channel pointers from a NULL base.
       tensor->allocsize = 0;
       return;
     }
     tensor->allocsize = newallocsize;
   }
   tensor->width = width;
   tensor->height = height;
   tensor->stride = width;
   tensor->channels = channels;
   // Channel planes are laid out back to back inside the single allocation.
   for (int c = 1; c < channels; ++c)
     tensor->buf[c] = &tensor->buf[0][c * width * height];
 }

 // Copies the first `copy_channels` channels of `src` into `dst`, starting at
 // channel `dst_offset` of `dst`. Both tensors must have the same width and
 // height. When both tensors are densely packed each plane is copied with a
 // single memcpy; otherwise the copy goes row by row honoring both strides.
 static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
                         TENSOR *dst) {
   assert(src->width == dst->width);
   assert(src->height == dst->height);
   assert(copy_channels <= src->channels);

   const int both_packed =
       (src->stride == dst->width) && (dst->stride == dst->width);
   if (both_packed) {
     for (int ch = 0; ch < copy_channels; ++ch) {
       memcpy(dst->buf[dst_offset + ch], src->buf[ch],
              sizeof(*dst->buf[0]) * src->width * src->height);
     }
     return;
   }

   for (int ch = 0; ch < copy_channels; ++ch) {
     for (int row = 0; row < dst->height; ++row) {
       memcpy(&dst->buf[dst_offset + ch][row * dst->stride],
              &src->buf[ch][row * src->stride],
              dst->width * sizeof(*dst->buf[ch]));
     }
   }
 }

 // Makes `tensor` describe externally owned channel buffers. The tensor is
 // marked as owning nothing (allocsize == 0). When `buf` is NULL the first
 // `channels` plane pointers are cleared instead of copied.
 static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
                           int channels, int width, int height, int stride) {
   tensor->allocsize = 0;
   tensor->channels = channels;
   tensor->width = width;
   tensor->height = height;
   tensor->stride = stride;
   for (int ch = 0; ch < channels; ++ch) {
     tensor->buf[ch] = buf ? buf[ch] : NULL;
   }
 }

 // Exchanges the complete contents (shape, ownership and plane pointers) of
 // the two tensors.
 static void swap_tensor(TENSOR *t1, TENSOR *t2) {
   const TENSOR saved_t2 = *t2;
   *t2 = *t1;
   *t1 = saved_t2;
 }

 // Concatenates `src` onto `dst` along the channel axis: afterwards dst holds
 // its original channels followed by the channels of src, and dst->channels is
 // the sum of both counts. Both tensors must have the same width and height.
 //
 // If dst's allocation is too small, a larger packed buffer is allocated, the
 // existing dst channels are copied over and the old buffer is freed. If that
 // allocation fails, dst is left unchanged.
 static void concat_tensor(const TENSOR *src, TENSOR *dst) {
   assert(src->width == dst->width);
   assert(src->height == dst->height);

   const int dst_channels = dst->channels;
   const int channels = dst->channels + src->channels;
   const int newallocsize = channels * dst->width * dst->height;
   if (dst->allocsize < newallocsize) {
     TENSOR t;
     init_tensor(&t);
     // allocate new buffers and copy first the dst channels
     realloc_tensor(&t, channels, dst->width, dst->height);
     // Allocation failed: bail out rather than copying through a NULL base.
     if (t.buf[0] == NULL) return;
     copy_tensor(dst, dst->channels, 0, &t);
     // Swap the tensors and free the old buffers
     swap_tensor(dst, &t);
     free_tensor(&t);
   }
   for (int c = 1; c < channels; ++c)
     dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
   // When the existing buffer was large enough no swap happened, so the
   // channel count must be updated here; on the reallocation path this is a
   // no-op because realloc_tensor() already stored the same value.
   dst->channels = channels;
   // Copy the channels in src after the first dst_channels channels.
   copy_tensor(src, src->channels, dst_channels, dst);
 }

 // Returns 1 when both tensors have the same width and height, 0 otherwise.
 // The channel counts are not compared.
 int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
   if (t1->width != t2->width) return 0;
   return t1->height == t2->height;
 }

 // Returns 1 when both tensors agree in channel count, width and height,
 // 0 otherwise.
 int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
   const int same_channels = (t1->channels == t2->channels);
   const int same_width = (t1->width == t2->width);
   const int same_height = (t1->height == t2->height);
   return same_channels && same_width && same_height;
 }

 // Computes the output plane size of a single layer from its input size.
 //
 // Convolution:
 //   SAME padding:  out = ceil(in / skip)
 //   VALID padding: out = (in - filter + skip) / skip
 // Deconvolution:
 //   SAME padding:  out = in * skip
 //   VALID padding: out = (in - 1) * skip + filter
 //
 // For an unknown padding type the outputs are left untouched (and the assert
 // fires in debug builds).
 static void find_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height) {
   const int skip_w = layer_config->skip_width;
   const int skip_h = layer_config->skip_height;
   const int filt_w = layer_config->filter_width;
   const int filt_h = layer_config->filter_height;

   if (layer_config->deconvolve) {
     switch (layer_config->pad) {
       case PADDING_SAME_ZERO:
       case PADDING_SAME_REPLICATE:
         *out_width = in_width * skip_w;
         *out_height = in_height * skip_h;
         break;
       case PADDING_VALID:
         *out_width = (in_width - 1) * skip_w + filt_w;
         *out_height = (in_height - 1) * skip_h + filt_h;
         break;
       default: assert(0 && "Unknown padding type");
     }
     return;
   }

   switch (layer_config->pad) {
     case PADDING_SAME_ZERO:
     case PADDING_SAME_REPLICATE:
       *out_width = (in_width + skip_w - 1) / skip_w;
       *out_height = (in_height + skip_h - 1) / skip_h;
       break;
     case PADDING_VALID:
       *out_width = (in_width - filt_w + skip_w) / skip_w;
       *out_height = (in_height - filt_h + skip_h) / skip_h;
       break;
     default: assert(0 && "Unknown padding type");
   }
 }

 // Updates channels_per_branch[] with the channel counts produced by one
 // layer.
 //
 // Every branch selected in input_to_branches (other than the layer's own
 // branch) receives, depending on branch_copy_type, the layer's input channel
 // count, its output channel count, or its output channel count plus the
 // counts of all branches listed in branches_to_combine. The layer's own
 // branch always ends up with out_channels plus the counts of the branches to
 // combine. Entries are updated in place and in branch order, so later sums
 // see earlier updates.
 void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
                            int channels_per_branch[]) {
   const int own_branch = layer_config->branch;
   const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

   for (int dst = 0; dst < CNN_MAX_BRANCHES; ++dst) {
     if (!(branch_config->input_to_branches & (1 << dst))) continue;
     if (dst == own_branch) continue;

     switch (layer_config->branch_copy_type) {
       case BRANCH_INPUT:
         channels_per_branch[dst] = layer_config->in_channels;
         break;
       case BRANCH_OUTPUT:
         channels_per_branch[dst] = layer_config->out_channels;
         break;
       case BRANCH_COMBINED:
         channels_per_branch[dst] = layer_config->out_channels;
         for (int other = 0; other < CNN_MAX_BRANCHES; ++other) {
           if (!(branch_config->branches_to_combine & (1 << other))) continue;
           if (other == own_branch) continue;
           assert(channels_per_branch[other] > 0);
           channels_per_branch[dst] += channels_per_branch[other];
         }
         break;
       default: break;
     }
   }

   channels_per_branch[own_branch] = layer_config->out_channels;
   for (int other = 0; other < CNN_MAX_BRANCHES; ++other) {
     if (!(branch_config->branches_to_combine & (1 << other))) continue;
     if (other == own_branch) continue;
     assert(channels_per_branch[other] > 0);
     channels_per_branch[own_branch] += channels_per_branch[other];
   }
 }

 #if CONFIG_DEBUG
 // Debug-only sanity check: returns 1 if any layer of the network is marked as
 // an output layer (output_num != -1), 0 otherwise.
 static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
   const CNN_LAYER_CONFIG *layers = cnn_config->layer_config;
   const int layer_count = cnn_config->num_layers;
   int found = 0;

   for (int i = 0; i < layer_count && !found; ++i) {
     found = (layers[i].output_num != -1);
   }
   return found;
 }
 #endif

 // Walks the layers of `cnn_config` and reports the size of every network
 // output. For each layer with output_num != -1, the entries out_width,
 // out_height and out_channels at index output_num receive that layer's
 // output width, height and channel count. The input to branch 0 is the given
 // in_width x in_height extended by ext_width / ext_height on both sides.
 void av1_find_cnn_output_size(int in_width, int in_height,
                               const CNN_CONFIG *cnn_config, int *out_width,
                               int *out_height, int *out_channels) {
   int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
   // Current input size of each branch; 0 means "not fed yet".
   int branch_width[CNN_MAX_BRANCHES] = { 0 };
   int branch_height[CNN_MAX_BRANCHES] = { 0 };
   branch_width[0] = in_width + cnn_config->ext_width * 2;
   branch_height[0] = in_height + cnn_config->ext_height * 2;

 #if CONFIG_DEBUG
   assert(cnn_has_at_least_one_output(cnn_config));
 #endif

   for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
     const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
     const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
     const int branch = layer_config->branch;
     int layer_out_w = 0;
     int layer_out_h = 0;

     // BRANCH_INPUT: the other branches see this layer's input size.
     if (layer_config->branch_copy_type == BRANCH_INPUT) {
       for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
         if (!(branch_config->input_to_branches & (1 << b))) continue;
         if (b == branch) continue;
         assert(branch_width[branch] > 0 && branch_height[branch] > 0);
         branch_width[b] = branch_width[branch];
         branch_height[b] = branch_height[branch];
       }
     }

     find_layer_output_size(branch_width[branch], branch_height[branch],
                            layer_config, &layer_out_w, &layer_out_h);
     branch_width[branch] = layer_out_w;
     branch_height[branch] = layer_out_h;

     // BRANCH_OUTPUT: the other branches see this layer's output size.
     if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
       for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
         if (!(branch_config->input_to_branches & (1 << b))) continue;
         if (b == branch) continue;
         branch_width[b] = layer_out_w;
         branch_height[b] = layer_out_h;
       }
     }

     find_cnn_out_channels(layer_config, channels_per_branch);

     const int output_num = layer_config->output_num;
     if (output_num == -1) continue;  // Not an output layer.
     out_width[output_num] = layer_out_w;
     out_height[output_num] = layer_out_h;
     out_channels[output_num] = channels_per_branch[layer_config->branch];
   }
 }

 // Maps an ACTIVATION enumerator to the function implementing it. SIGMOID is
 // not implemented and unknown values are rejected: both trip an assert in
 // debug builds and yield NULL otherwise.
 activation_fn get_activation(ACTIVATION layer_activation) {
   activation_fn fn = NULL;
   switch (layer_activation) {
     case NONE: fn = identity; break;
     case RELU: fn = relu; break;
     case SOFTSIGN: fn = softsign; break;
     case SIGMOID:
       assert(0 && "Sigmoid has not been supported in CNN.");  // TO DO
       break;
     default: assert(0 && "Unknown activation type"); break;
   }
   return fn;
 }

 // Returns the offset of the first input position sampled along one dimension
 // by the SAME-padding convolution paths. With rem = width % stride and
 // dif = (rem ? rem - 1 : stride - 1), the result is
 // min((dif + filt_width % 2) / 2, (filt_width - 1) / 2).
 static INLINE int get_start_shift_convolve(int width, int filt_width,
                                            int stride) {
   const int rem = width % stride;
   const int dif = rem ? rem - 1 : stride - 1;
   const int shift = (dif + (filt_width % 2)) / 2;
   const int filt_off = (filt_width - 1) / 2;
   return AOMMIN(shift, filt_off);
 }

 // Element-wise accumulation: output[c] += add[c] for every channel, over a
 // width x height region. Both plane sets are addressed with the same row
 // `stride`; elements beyond `width` in a row are not touched.
 void av1_cnn_add_c(float **output, int channels, int width, int height,
                    int stride, const float **add) {
   for (int ch = 0; ch < channels; ++ch) {
     for (int row = 0; row < height; ++row) {
       for (int col = 0; col < width; ++col) {
         const int idx = row * stride + col;
         output[ch][idx] += add[ch][idx];
       }
     }
   }
 }

 // Applies the activation selected by `layer_activation` in place to every
 // element of a width x height region in each of the `channels` planes.
 // Elements beyond `width` in a row are not touched.
 void av1_cnn_activate_c(float **output, int channels, int width, int height,
                         int stride, ACTIVATION layer_activation) {
   const activation_fn apply = get_activation(layer_activation);
   for (int ch = 0; ch < channels; ++ch) {
     for (int row = 0; row < height; ++row) {
       for (int col = 0; col < width; ++col) {
         float *const px = &output[ch][row * stride + col];
         *px = apply(*px);
       }
     }
   }
 }

 // Copies the layer's active tensor into the output tensor of every branch
 // selected in input_to_branches, except `branch` itself. Each destination is
 // (re)allocated to the active tensor's width and height. channels_to_copy,
 // when positive, limits how many channels are copied; otherwise all channels
 // of the active tensor are copied.
 static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
                                            const CNN_LAYER_CONFIG *layer_config,
                                            int branch, TENSOR branch_output[]) {
   const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
   for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
     if (!(branch_config->input_to_branches & (1 << b))) continue;
     if (b == branch) continue;

     // The copy becomes the input of the first layer of branch b.
     int copy_channels = layer_active_tensor->channels;
     if (branch_config->channels_to_copy > 0) {
       copy_channels = branch_config->channels_to_copy;
     }
     realloc_tensor(&branch_output[b], copy_channels,
                    layer_active_tensor->width, layer_active_tensor->height);
     copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
   }
 }

 // Worker hook: unpacks the CONVOLVE_OPS passed as `arg1` and runs
 // av1_cnn_convolve() on it. `arg2` is unused. Always returns 1.
 static int convolve_layer(void *arg1, void *arg2) {
   (void)arg2;
   const CONVOLVE_OPS *const ops = (const CONVOLVE_OPS *)arg1;
   av1_cnn_convolve(ops->input, ops->in_width, ops->in_height, ops->in_stride,
                    ops->layer_config, ops->output, ops->out_stride,
                    ops->start_idx, ops->th_step);
   return 1;
 }

 // Runs one convolution layer split across the workers in `thread_data`.
 //
 // Worker `th` is given start_idx == th and a step equal to the number of
 // workers actually used, so the slices together cover the whole layer. The
 // last slice runs synchronously on the calling thread; all others are
 // launched asynchronously. The function returns after every worker has been
 // synced.
 //
 // At most CNN_MAX_THREADS workers are used. The clamped count is also used as
 // the step and for the "last worker" test; using the unclamped count there
 // would leave the slices of the unused workers uncomputed.
 static void convolve_layer_mt(const float **input, int in_width, int in_height,
                               int in_stride,
                               const CNN_LAYER_CONFIG *layer_config,
                               const CNN_THREAD_DATA *thread_data,
                               float **output, int out_stride) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   const int num_workers = AOMMIN(thread_data->num_workers, CNN_MAX_THREADS);

   // Must stay alive until all workers are synced: workers keep pointers into
   // this array.
   CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
   for (int th = 0; th < num_workers; ++th) {
     AVxWorker *const worker = &thread_data->workers[th];
     winterface->reset(worker);

     CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
                                  in_stride,  layer_config, output,
                                  out_stride, th,           num_workers };
     convolve_ops[th] = convolve_op;
     worker->hook = convolve_layer;
     worker->data1 = &(convolve_ops[th]);
     worker->data2 = NULL;

     // Start convolving.
     if (th == num_workers - 1) {
       winterface->execute(worker);
     } else {
       winterface->launch(worker);
     }
   }

   // Wait until all workers have finished.
   for (int th = 0; th < num_workers; ++th) {
     winterface->sync(&thread_data->workers[th]);
   }
 }

 void av1_cnn_convolve_c(const float **input, int in_width, int in_height,
                         int in_stride, const CNN_LAYER_CONFIG *layer_config,
                         float **output, int out_stride, int start_idx,
                         int step) {
   assert(!layer_config->deconvolve);
   const int cstep = layer_config->in_channels * layer_config->out_channels;
   const int filter_height_half = layer_config->filter_height >> 1;
   const int filter_width_half = layer_config->filter_width >> 1;
   const int channel_step = AOMMAX(step, 1);

   if (layer_config->maxpool &&
       (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
     switch (layer_config->pad) {
       case PADDING_SAME_ZERO:
         for (int i = 0; i < layer_config->out_channels; ++i) {
           for (int h = 0, u = 0; h < in_height;
                h += layer_config->skip_height, ++u) {
             for (int w = 0, v = 0; w < in_width;
                  w += layer_config->skip_width, ++v) {
               for (int hh = h;
                    hh < AOMMIN(in_height, h + layer_config->skip_height);
                    ++hh) {
                 for (int ww = w;
                      ww < AOMMIN(in_width, w + layer_config->skip_width);
                      ++ww) {
                   float sum = layer_config->bias[i];
                   for (int k = 0; k < layer_config->in_channels; ++k) {
                     int off = k * layer_config->out_channels + i;
                     for (int l = 0; l < layer_config->filter_height; ++l) {
                       const int ii = hh + l - filter_height_half;
                       for (int m = 0; m < layer_config->filter_width;
                            ++m, off += cstep) {
                         const int jj = ww + m - filter_width_half;
                         if (ii < 0 || ii >= in_height || jj < 0 ||
                             jj >= in_width)
                           continue;
                         sum += layer_config->weights[off] *
                                input[k][ii * in_stride + jj];
                       }
                     }
                   }
                   const float a = sum;
                   if (h == hh && w == ww)
                     output[i][u * out_stride + v] = a;
                   else
                     output[i][u * out_stride + v] =
                         AOMMAX(output[i][u * out_stride + v], a);
                 }
               }
             }
           }
         }
         break;
       case PADDING_SAME_REPLICATE:
         for (int i = 0; i < layer_config->out_channels; ++i) {
           for (int h = 0, u = 0; h < in_height;
                h += layer_config->skip_height, ++u) {
             for (int w = 0, v = 0; w < in_width;
                  w += layer_config->skip_width, ++v) {
               for (int hh = h;
                    hh < AOMMIN(in_height, h + layer_config->skip_height);
                    ++hh) {
                 for (int ww = w;
                      ww < AOMMIN(in_width, w + layer_config->skip_width);
                      ++ww) {
                   float sum = layer_config->bias[i];
                   for (int k = 0; k < layer_config->in_channels; ++k) {
                     int off = k * layer_config->out_channels + i;
                     for (int l = 0; l < layer_config->filter_height; ++l) {
                       const int ii =
                           CLAMPINDEX(hh + l - filter_height_half, in_height);
                       for (int m = 0; m < layer_config->filter_width;
                            ++m, off += cstep) {
                         const int jj =
                             CLAMPINDEX(ww + m - filter_width_half, in_width);
                         assert(ii >= 0 && ii < in_height && jj >= 0 &&
                                jj < in_width);
                         sum += layer_config->weights[off] *
                                input[k][ii * in_stride + jj];
                       }
                     }
                   }
                   const float a = sum;
                   if (h == hh && w == ww)
                     output[i][u * out_stride + v] = a;
                   else
                     output[i][u * out_stride + v] =
                         AOMMAX(output[i][u * out_stride + v], a);
                 }
               }
             }
           }
         }
         break;
       case PADDING_VALID:
         for (int i = 0; i < layer_config->out_channels; ++i) {
           for (int h = 0, u = 0;
                h < in_height - layer_config->filter_height + 1;
                h += layer_config->skip_height, ++u) {
             for (int w = 0, v = 0;
                  w < in_width - layer_config->filter_width + 1;
                  w += layer_config->skip_width, ++v) {
               for (int hh = h;
                    hh < AOMMIN(in_height, h + layer_config->skip_height);
                    ++hh) {
                 for (int ww = w;
                      ww < AOMMIN(in_width, w + layer_config->skip_width);
                      ++ww) {
                   float sum = layer_config->bias[i];
                   for (int k = 0; k < layer_config->in_channels; ++k) {
                     int off = k * layer_config->out_channels + i;
                     for (int l = 0; l < layer_config->filter_height; ++l) {
                       const int ii = hh + l;
                       for (int m = 0; m < layer_config->filter_width;
                            ++m, off += cstep) {
                         const int jj = ww + m;
                         assert(ii >= 0 && ii < in_height && jj >= 0 &&
                                jj < in_width);
                         sum += layer_config->weights[off] *
                                input[k][ii * in_stride + jj];
                       }
                     }
                   }
                   const float a = sum;
                   if (h == hh && w == ww)
                     output[i][u * out_stride + v] = a;
                   else
                     output[i][u * out_stride + v] =
                         AOMMAX(output[i][u * out_stride + v], a);
                 }
               }
             }
           }
         }
         break;
       default: assert(0 && "Unknown padding type");
     }
   } else {
     // Results in element-wise matrix multiplication.
     if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
       const int start_h = get_start_shift_convolve(
           in_height, layer_config->filter_height, layer_config->skip_height);
       const int start_w =
           get_start_shift_convolve(in_width, layer_config->filter_width,
                                    layer_config->skip_width) +
           start_idx * layer_config->skip_width;
       const int out_w_step = AOMMAX(step, 1);
       const int in_w_step = layer_config->skip_width * out_w_step;
       for (int i = 0; i < layer_config->out_channels; ++i) {
         for (int h = start_h, u = 0; h < in_height;
              h += layer_config->skip_height, ++u) {
           const int in_h = h * in_stride;
           const int out_h = u * out_stride + start_idx;
           for (int w = start_w, out_index = out_h; w < in_width;
                w += in_w_step, out_index += out_w_step) {
             float sum = layer_config->bias[i];
             for (int k = 0; k < layer_config->in_channels; ++k) {
               sum += layer_config->weights[k * layer_config->out_channels + i] *
                      input[k][in_h + w];
             }
             output[i][out_index] = sum;
           }
         }
       }
       return;
     }
     const int ii_shift =
         filter_height_half - (layer_config->filter_height - 1) % 2;
     const int jj_shift =
         filter_width_half - (layer_config->filter_width - 1) % 2;
     switch (layer_config->pad) {
       case PADDING_SAME_ZERO: {
         const int start_h = get_start_shift_convolve(
             in_height, layer_config->filter_height, layer_config->skip_height);
         const int start_w = get_start_shift_convolve(
             in_width, layer_config->filter_width, layer_config->skip_width);
         const int end_ii_shift = filter_height_half + 1;
         const int end_jj_shift = filter_width_half + 1;
         // *_filter_margin stores the number of pixels along a dimension in the
         // intersection of the complement of the image in the extended image
         // and the filter.
         const int top_filter_margin = layer_config->filter_width * ii_shift;
         const int right_filter_margin = end_jj_shift - in_width;
         for (int i = start_idx; i < layer_config->out_channels;
              i += channel_step) {
           for (int h = start_h, u = 0; h < in_height;
                h += layer_config->skip_height, ++u) {
             const int out_h = u * out_stride;
             const int top_cstep =
                 AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
                     cstep +
                 i;
             const int start_ii = AOMMAX(0, h - ii_shift);
             const int end_ii = AOMMIN(in_height, h + end_ii_shift);
             for (int w = start_w, out_index = out_h; w < in_width;
                  w += layer_config->skip_width, ++out_index) {
               const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
               const int right_cstep =
                   AOMMAX(0, right_filter_margin + w) * cstep;
               const int start_jj = AOMMAX(0, w - jj_shift);
               const int end_jj = AOMMIN(in_width, w + end_jj_shift);
               float sum = layer_config->bias[i];
               for (int k = 0; k < layer_config->in_channels; ++k) {
                 int off = k * layer_config->out_channels + top_cstep;
                 for (int ii = start_ii; ii < end_ii; ++ii) {
                   off += left_cstep;
                   for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
                     sum += layer_config->weights[off] *
                            input[k][ii * in_stride + jj];
                   }
                   off += right_cstep;
                 }
               }
               output[i][out_index] = sum;
             }
           }
         }
         break;
       }
       case PADDING_SAME_REPLICATE: {
         // h and w are shifted to an offset coordinate system to reduce in-loop
         // computation.
         const int start_h =
             get_start_shift_convolve(in_height, layer_config->filter_height,
                                      layer_config->skip_height) -
             ii_shift;
         const int start_w =
             get_start_shift_convolve(in_width, layer_config->filter_width,
                                      layer_config->skip_width) -
             jj_shift;
         const int end_h = in_height - ii_shift;
         const int end_w = in_width - jj_shift;
         for (int i = start_idx; i < layer_config->out_channels;
              i += channel_step) {
           for (int h = start_h, u = 0; h < end_h;
                h += layer_config->skip_height, ++u) {
             const int out_h = u * out_stride;
             const int upper_ii_index = layer_config->filter_height + h;
             for (int w = start_w, out_index = out_h; w < end_w;
                  w += layer_config->skip_width, ++out_index) {
               const int upper_jj_index = layer_config->filter_width + w;
               float sum = layer_config->bias[i];
               for (int k = 0; k < layer_config->in_channels; ++k) {
                 int off = k * layer_config->out_channels + i;
                 for (int ii = h; ii < upper_ii_index; ++ii) {
                   const int clamped_ii = CLAMPINDEX(ii, in_height);
                   for (int jj = w; jj < upper_jj_index; ++jj) {
                     const int clamped_jj = CLAMPINDEX(jj, in_width);
                     assert(clamped_ii >= 0 && clamped_ii < in_height &&
                            clamped_jj >= 0 && clamped_jj < in_width);
                     sum += layer_config->weights[off] *
                            input[k][clamped_ii * in_stride + clamped_jj];
                     off += cstep;
                   }
                 }
               }
               output[i][out_index] = sum;
             }
           }
         }
         break;
       }
       case PADDING_VALID: {
         for (int i = start_idx; i < layer_config->out_channels;
              i += channel_step) {
           for (int h = 0, u = 0;
                h < in_height - layer_config->filter_height + 1;
                h += layer_config->skip_height, ++u) {
             const int out_h = u * out_stride;
             const int upper_ii_index = layer_config->filter_height + h;
             for (int w = 0, out_index = out_h;
                  w < in_width - layer_config->filter_width + 1;
                  w += layer_config->skip_width, ++out_index) {
               const int upper_jj_index = layer_config->filter_width + w;
               float sum = layer_config->bias[i];
               for (int k = 0; k < layer_config->in_channels; ++k) {
                 int off = k * layer_config->out_channels + i;
                 for (int ii = h; ii < upper_ii_index; ++ii) {
                   for (int jj = w; jj < upper_jj_index; ++jj) {
                     assert(ii >= 0 && ii < in_height && jj >= 0 &&
                            jj < in_width);
                     sum += layer_config->weights[off] *
                            input[k][ii * in_stride + jj];
                     off += cstep;
                   }
                 }
               }
               output[i][out_index] = sum;
             }
           }
         }
         break;
       }
       default: assert(0 && "Unknown padding type");
     }
   }
 }

 // Returns half (rounded down) of the amount by which the filter extent
 // exceeds the stride, or 0 when the filter is not larger than the stride.
 static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
   const int excess = filt_width - stride;
   return AOMMAX(excess, 0) / 2;
 }

 // In-place batch normalization of a multi-channel image. Every element of the
 // width x height region of channel ch is replaced by
 //   gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch].
 // Rows are `stride` floats apart; elements beyond `width` are not touched.
 // gamma, beta, mean and std must all be non-NULL and hold one entry per
 // channel.
 void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
                          int stride, const float *gamma, const float *beta,
                          const float *mean, const float *std) {
   // All four parameter arrays are dereferenced below, so check each of them.
   assert(gamma && beta && mean && std && "batchnorm has null parameter!");
   for (int ch = 0; ch < channels; ch++) {
     const float ch_gamma = gamma[ch];
     const float ch_beta = beta[ch];
     const float ch_mean = mean[ch];
     const float ch_std = std[ch];
     float *image_row = image[ch];

     for (int row = 0; row < height; row++) {
       for (int col = 0; col < width; col++) {
         image_row[col] =
             ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
       }
       image_row += stride;
     }
   }
 }

 void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
                           int in_stride, const CNN_LAYER_CONFIG *layer_config,
                           float **output, int out_stride) {
   assert(layer_config->deconvolve);

   const int cstep = layer_config->in_channels * layer_config->out_channels;

   int out_width = 0;
   int out_height = 0;
   find_layer_output_size(in_width, in_height, layer_config, &out_width,
                          &out_height);
   switch (layer_config->pad) {
     case PADDING_SAME_ZERO:
       for (int i = 0; i < layer_config->out_channels; ++i) {
         for (int u = 0; u < out_height; ++u) {
           for (int v = 0; v < out_width; ++v) {
             float sum = layer_config->bias[i];
             for (int k = 0; k < layer_config->in_channels; ++k) {
               int off = k * layer_config->out_channels + i;
               for (int l = 0; l < layer_config->filter_height; ++l) {
                 const int h =
                     u - l +
                     get_start_shift_deconvolve(layer_config->filter_height,
                                                layer_config->skip_height);
                 for (int m = 0; m < layer_config->filter_width;
                      ++m, off += cstep) {
                   const int w =
                       v - m +
                       get_start_shift_deconvolve(layer_config->filter_width,
                                                  layer_config->skip_width);
                   if ((h % layer_config->skip_height) != 0 ||
                       (w % layer_config->skip_width) != 0)
                     continue;
                   const int ii = h / layer_config->skip_height;
                   const int jj = w / layer_config->skip_width;
                   if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                     continue;
                   sum += layer_config->weights[off] *
                          input[k][ii * in_stride + jj];
                 }
               }
             }
             output[i][u * out_stride + v] = sum;
           }
         }
       }
       break;
     case PADDING_SAME_REPLICATE:
       for (int i = 0; i < layer_config->out_channels; ++i) {
         for (int u = 0; u < out_height; ++u) {
           for (int v = 0; v < out_width; ++v) {
             float sum = layer_config->bias[i];
             for (int k = 0; k < layer_config->in_channels; ++k) {
               int off = k * layer_config->out_channels + i;
               for (int l = 0; l < layer_config->filter_height; ++l) {
                 const int h =
                     u - l +
                     get_start_shift_deconvolve(layer_config->filter_height,
                                                layer_config->skip_height);
                 for (int m = 0; m < layer_config->filter_width;
                      ++m, off += cstep) {
                   const int w =
                       v - m +
                       get_start_shift_deconvolve(layer_config->filter_width,
                                                  layer_config->skip_width);
                   if ((h % layer_config->skip_height) != 0 ||
                       (w % layer_config->skip_width) != 0)
                     continue;
                   const int ii =
                       CLAMPINDEX(h / layer_config->skip_height, in_height);
                   const int jj =
                       CLAMPINDEX(w / layer_config->skip_width, in_width);
                   assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
                   sum += layer_config->weights[off] *
                          input[k][ii * in_stride + jj];
                 }
               }
             }
             output[i][u * out_stride + v] = sum;
           }
         }
       }
       break;
     case PADDING_VALID:
       for (int i = 0; i < layer_config->out_channels; ++i) {
         for (int u = 0; u < out_height; ++u) {
           for (int v = 0; v < out_width; ++v) {
             float sum = layer_config->bias[i];
             for (int k = 0; k < layer_config->in_channels; ++k) {
               int off = k * layer_config->out_channels + i;
               for (int l = 0; l < layer_config->filter_height; ++l) {
                 const int h = u - l;
                 for (int m = 0; m < layer_config->filter_width;
                      ++m, off += cstep) {
                   const int w = v - m;
                   if ((h % layer_config->skip_height) != 0 ||
                       (w % layer_config->skip_width) != 0)
                     continue;
                   const int ii = h / layer_config->skip_height;
                   const int jj = w / layer_config->skip_width;
                   if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
                     continue;
                   sum += layer_config->weights[off] *
                          input[k][ii * in_stride + jj];
                 }
               }
             }
             output[i][u * out_stride + v] = sum;
           }
         }
       }
       break;
     default: assert(0 && "Unknown padding type");
   }
 }

 // Runs the whole network described by cnn_config on a float input tensor.
 //
 // input        - one plane pointer per input channel of layer 0.
 // in_width, in_height, in_stride
 //              - geometry shared by every input plane.
 // cnn_config   - layer list; layers are executed in index order.
 // thread_data  - when num_workers > 1, convolution layers are dispatched to
 //                convolve_layer_mt(); deconvolution layers never are.
 // output_struct- destination buffers. output_buffer is treated as one flat
 //                array of channel pointers, partitioned per output by
 //                output_channels[]; output_strides[] gives each output's
 //                stride.
 //
 // Per branch, tensor1 holds the current layer's input and tensor2 its
 // output; they are swapped at the start of every non-first layer.
 void av1_cnn_predict_c(const float **input, int in_width, int in_height,
                        int in_stride, const CNN_CONFIG *cnn_config,
                        const CNN_THREAD_DATA *thread_data,
                        CNN_MULTI_OUT *output_struct) {
   TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 };
   TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 };

   // output[n] points at the first channel pointer belonging to output n:
   // output 0 starts at output_buffer and each later output starts right after
   // the channels of the one before it.
   float **output[CNN_MAX_BRANCHES];
   const int *out_chs = output_struct->output_channels;
   output[0] = output_struct->output_buffer;
   for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
     output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
   }

   int i_width = in_width;
   int i_height = in_height;
   int o_width = 0, o_height = 0;
   // init_tensor() memsets each tensor to zero (the arrays were already
   // zero-initialized by their declarations above).
   for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
     init_tensor(&tensor1[b]);
     init_tensor(&tensor2[b]);
   }

   const int *out_stride = output_struct->output_strides;
   for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
     const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
     const int branch = layer_config->branch;
     const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;

     // Allocate input tensor
     if (layer == 0) {       // First layer
       assert(branch == 0);  // First layer must be primary branch
       // NOTE(review): assign_tensor() is not visible here; presumably it
       // points the tensor at the caller's planes without copying - confirm.
       assign_tensor(&tensor1[branch], (float **)input,
                     layer_config->in_channels, in_width, in_height, in_stride);
     } else {  // Non-first layer
       // Swap tensor1 and tensor2
       // The previous output of this branch becomes this layer's input.
       swap_tensor(&tensor1[branch], &tensor2[branch]);

       i_width = tensor1[branch].width;
       i_height = tensor1[branch].height;
     }

     // Allocate output tensor
     find_layer_output_size(i_width, i_height, layer_config, &o_width,
                            &o_height);
     // output_num == -1 marks an intermediate layer; any other value indexes
     // output[] / out_stride[] and makes the layer write straight into the
     // caller's buffers.
     const int output_num = layer_config->output_num;
     if (output_num == -1) {  // Non-output layer
       realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
                      o_height);
     } else {  // Output layer
       // Release any buffer tensor2 owns before redirecting it to the
       // caller-provided output planes.
       free_tensor(&tensor2[branch]);
       assign_tensor(&tensor2[branch], output[output_num],
                     layer_config->out_channels, o_width, o_height,
                     out_stride[output_num]);
     }

     // If we are combining branches make sure that the branch to combine
     // is different from the current branch.
     assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
                    !(branch_config->branches_to_combine & (1 << branch))));

     // BRANCH_INPUT: hand this layer's input to the other branches before it
     // is processed.
     if (layer_config->branch_copy_type == BRANCH_INPUT) {
       copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch,
                                      tensor2);
     }
     // Check consistency of input and output channels
     assert(tensor1[branch].channels == layer_config->in_channels);
     assert(tensor2[branch].channels == layer_config->out_channels);

     // Convolve/Deconvolve
     if (!cnn_config->layer_config[layer].deconvolve) {
       if (thread_data->num_workers > 1) {
         convolve_layer_mt((const float **)tensor1[branch].buf,
                           tensor1[branch].width, tensor1[branch].height,
                           tensor1[branch].stride, layer_config, thread_data,
                           tensor2[branch].buf, tensor2[branch].stride);
       } else {
         // NOTE(review): the trailing 0, 1 look like start_idx / th_step for a
         // single-worker pass over the layer - confirm against
         // av1_cnn_convolve().
         av1_cnn_convolve((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config,
                          tensor2[branch].buf, tensor2[branch].stride, 0, 1);
       }
     } else {
       av1_cnn_deconvolve((const float **)tensor1[branch].buf,
                          tensor1[branch].width, tensor1[branch].height,
                          tensor1[branch].stride, layer_config,
                          tensor2[branch].buf, tensor2[branch].stride);
     }

     // BRANCH_OUTPUT: hand the raw layer output (before combine, activation
     // and batchnorm) to the other branches.
     if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
       copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                      tensor2);
     }

     // Add tensors from other branches if needed
     if (layer_config->branch_combine_type == BRANCH_ADD) {
       for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
         if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
           assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
           av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
                       tensor2[branch].width, tensor2[branch].height,
                       tensor2[branch].stride, (const float **)tensor2[b].buf);
         }
       }
     }

     // Non-linearity
     if (layer_config->activation != IDENTITY)
       av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
                        tensor2[branch].width, tensor2[branch].height,
                        tensor2[branch].stride, layer_config->activation);

     // Batch normalization runs after the activation and only when the layer
     // supplies a gamma table.
     if (layer_config->bn_params.bn_gamma) {
       av1_cnn_batchnorm(
           tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
           tensor2[branch].height, tensor2[branch].stride,
           layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
           layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
     }

     // Concatenate tensors
     if (layer_config->branch_combine_type == BRANCH_CAT) {
       if (output_num == -1) {  // Non-output layer
         for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
           if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
             assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
             assert(tensor2[b].channels > 0);
             // NOTE(review): concat_tensor() is not visible here; presumably
             // it appends tensor2[b]'s channels to tensor2[branch] - confirm.
             concat_tensor(&tensor2[b], &tensor2[branch]);
           }
         }
       } else {  // Output layer
         // First pass: count the total channels so the output tensor can be
         // re-pointed at enough caller-provided planes.
         const int existing_channels = tensor2[branch].channels;
         int num_chs = existing_channels;
         for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
           if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
             assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
             // Needed only to assign the new channel buffers
             num_chs += tensor2[b].channels;
           }
         }
         assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
                       o_height, out_stride[output_num]);

         // Second pass: copy each combined branch into the output, starting
         // at channel offset num_chs (just past what is already there).
         num_chs = existing_channels;
         for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
           if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
             assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
             // Needed only to assign the new channel buffers
             copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
                         &tensor2[branch]);
             num_chs += tensor2[b].channels;
           }
         }
       }
     }

     // BRANCH_COMBINED: hand the fully processed result of this layer to the
     // other branches.
     if (layer_config->branch_copy_type == BRANCH_COMBINED) {
       copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch,
                                      tensor2);
     }
   }

   // free_tensor() only releases buffers with a non-zero allocsize.
   for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
     free_tensor(&tensor1[b]);
     free_tensor(&tensor2[b]);
   }
 }

 // Runs the CNN on an 8-bit image and fills every output in `output`.
 //
 // dgd[c] points at the top-left visible sample of input plane c. Each plane
 // is converted to float, scaled by 1/255 and stored in a temporary buffer
 // that is ext_width / ext_height samples larger on every side, then
 // av1_cnn_predict() is called on that buffer.
 //
 // With strict_bounds set, the border is synthesized by replicating the edge
 // samples. Otherwise the border is read from dgd itself, so the caller must
 // provide ext_width / ext_height valid samples around the visible area.
 //
 // The function returns void, so if the temporary buffer cannot be allocated
 // it returns early and leaves `output` untouched.
 //
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
 void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                    int stride, const CNN_CONFIG *cnn_config,
                                    const CNN_THREAD_DATA *thread_data,
                                    CNN_MULTI_OUT *output) {
   const float max_val = 255.0;

   const int ext_width = cnn_config->ext_width;
   const int ext_height = cnn_config->ext_height;
   const int in_width = width + 2 * ext_width;
   const int in_height = height + 2 * ext_height;
   const int in_channels = cnn_config->layer_config[0].in_channels;
   const int in_stride = in_width;
   // inputs[] below has room for CNN_MAX_CHANNELS plane pointers only.
   assert(in_channels <= CNN_MAX_CHANNELS);

   // Samples per padded plane. Computed in size_t so that a large frame
   // cannot overflow int before the value reaches the allocator.
   const size_t plane_size = (size_t)in_stride * (size_t)in_height;
   float *inputs[CNN_MAX_CHANNELS];
   float *input_ =
       (float *)aom_malloc(plane_size * (size_t)in_channels * sizeof(*input_));
   // No error channel is available; avoid dereferencing a null pointer.
   if (input_ == NULL) return;

   for (int c = 0; c < in_channels; ++c) {
     inputs[c] = input_ + (size_t)c * plane_size;
     // `input` addresses the visible area; negative indices reach the border.
     float *input = inputs[c] + ext_height * in_stride + ext_width;

     if (cnn_config->strict_bounds) {
       for (int i = 0; i < height; ++i)
         for (int j = 0; j < width; ++j)
           input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
       // extend left and right
       for (int i = 0; i < height; ++i) {
         for (int j = -ext_width; j < 0; ++j)
           input[i * in_stride + j] = input[i * in_stride];
         for (int j = width; j < width + ext_width; ++j)
           input[i * in_stride + j] = input[i * in_stride + width - 1];
       }
       // extend top and bottom: replicate the full (already widened) first and
       // last rows
       for (int i = -ext_height; i < 0; ++i)
         memcpy(&input[i * in_stride - ext_width], &input[-ext_width],
                in_width * sizeof(*input));
       for (int i = height; i < height + ext_height; ++i)
         memcpy(&input[i * in_stride - ext_width],
                &input[(height - 1) * in_stride - ext_width],
                in_width * sizeof(*input));
     } else {
       // Border samples come straight from the source image.
       for (int i = -ext_height; i < height + ext_height; ++i)
         for (int j = -ext_width; j < width + ext_width; ++j)
           input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
     }
   }
   av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                   cnn_config, thread_data, output);

   aom_free(input_);
 }

 // Runs the CNN on a high-bit-depth image and fills every output in `output`.
 //
 // dgd[c] points at the top-left visible sample of input plane c. Each plane
 // is converted to float, scaled by 1 / ((1 << bit_depth) - 1) and stored in a
 // temporary buffer that is ext_width / ext_height samples larger on every
 // side, then av1_cnn_predict() is called on that buffer.
 //
 // With strict_bounds set, the border is synthesized by replicating the edge
 // samples. Otherwise the border is read from dgd itself, so the caller must
 // provide ext_width / ext_height valid samples around the visible area.
 //
 // The function returns void, so if the temporary buffer cannot be allocated
 // it returns early and leaves `output` untouched.
 //
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
 void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
                                           int stride,
                                           const CNN_CONFIG *cnn_config,
                                           const CNN_THREAD_DATA *thread_data,
                                           int bit_depth,
                                           CNN_MULTI_OUT *output) {
   const float max_val = (float)((1 << bit_depth) - 1);

   const int ext_width = cnn_config->ext_width;
   const int ext_height = cnn_config->ext_height;
   const int in_width = width + 2 * ext_width;
   const int in_height = height + 2 * ext_height;
   const int in_channels = cnn_config->layer_config[0].in_channels;
   const int in_stride = in_width;
   // inputs[] below has room for CNN_MAX_CHANNELS plane pointers only.
   assert(in_channels <= CNN_MAX_CHANNELS);

   // Samples per padded plane. Computed in size_t so that a large frame
   // cannot overflow int before the value reaches the allocator.
   const size_t plane_size = (size_t)in_stride * (size_t)in_height;
   float *inputs[CNN_MAX_CHANNELS];
   float *input_ =
       (float *)aom_malloc(plane_size * (size_t)in_channels * sizeof(*input_));
   // No error channel is available; avoid dereferencing a null pointer.
   if (input_ == NULL) return;

   for (int c = 0; c < in_channels; ++c) {
     inputs[c] = input_ + (size_t)c * plane_size;
     // `input` addresses the visible area; negative indices reach the border.
     float *input = inputs[c] + ext_height * in_stride + ext_width;

     if (cnn_config->strict_bounds) {
       for (int i = 0; i < height; ++i)
         for (int j = 0; j < width; ++j)
           input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
       // extend left and right
       for (int i = 0; i < height; ++i) {
         for (int j = -ext_width; j < 0; ++j)
           input[i * in_stride + j] = input[i * in_stride];
         for (int j = width; j < width + ext_width; ++j)
           input[i * in_stride + j] = input[i * in_stride + width - 1];
       }
       // extend top and bottom: replicate the full (already widened) first and
       // last rows
       for (int i = -ext_height; i < 0; ++i)
         memcpy(&input[i * in_stride - ext_width], &input[-ext_width],
                in_width * sizeof(*input));
       for (int i = height; i < height + ext_height; ++i)
         memcpy(&input[i * in_stride - ext_width],
                &input[(height - 1) * in_stride - ext_width],
                in_width * sizeof(*input));
     } else {
       // Border samples come straight from the source image.
       for (int i = -ext_height; i < height + ext_height; ++i)
         for (int j = -ext_width; j < width + ext_width; ++j)
           input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
     }
   }

   av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride,
                   cnn_config, thread_data, output);

   aom_free(input_);
 }

 // Single-output convenience wrapper for 8-bit input.
 //
 // Queries the network's output channel count with
 // av1_find_cnn_output_size(), describes `output` / `out_stride` as a
 // one-entry CNN_MULTI_OUT and forwards to av1_cnn_predict_img_multi_out().
 //
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
 void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
                          const CNN_CONFIG *cnn_config,
                          const CNN_THREAD_DATA *thread_data, float **output,
                          int out_stride) {
   // Only out_channels is consumed here; width and height are required
   // out-parameters of the size query.
   int out_width = 0, out_height = 0, out_channels = 0;
   av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                            &out_channels);
   const int output_chs[1] = { out_channels };
   const int output_strides[1] = { out_stride };
   // Exactly one output is described, so record that count explicitly instead
   // of leaving num_outputs implicitly zero.
   CNN_MULTI_OUT output_struct = { .num_outputs = 1,
                                   .output_channels = output_chs,
                                   .output_strides = output_strides,
                                   .output_buffer = output };
   av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config,
                                 thread_data, &output_struct);
 }

 // Single-output convenience wrapper for high-bit-depth input.
 //
 // Queries the network's output channel count with
 // av1_find_cnn_output_size(), describes `output` / `out_stride` as a
 // one-entry CNN_MULTI_OUT and forwards to
 // av1_cnn_predict_img_multi_out_highbd().
 //
 // Assume output already has proper allocation
 // Assume input image buffers all have same resolution and strides
 void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
                                 int stride, const CNN_CONFIG *cnn_config,
                                 const CNN_THREAD_DATA *thread_data,
                                 int bit_depth, float **output, int out_stride) {
   // Only out_channels is consumed here; width and height are required
   // out-parameters of the size query.
   int out_width = 0, out_height = 0, out_channels = 0;
   av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height,
                            &out_channels);
   const int output_chs[1] = { out_channels };
   const int output_strides[1] = { out_stride };
   // Exactly one output is described, so record that count explicitly instead
   // of leaving num_outputs implicitly zero.
   CNN_MULTI_OUT output_struct = { .num_outputs = 1,
                                   .output_channels = output_chs,
                                   .output_strides = output_strides,
                                   .output_buffer = output };
   av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config,
                                        thread_data, bit_depth, &output_struct);
 }