cnn.h - mozsearch

Enable keyboard shortcuts

/*

 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.

 * This source code is subject to the terms of the BSD 2 Clause License and

 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License

 * was not distributed with this source code in the LICENSE file, you can

 * obtain it at www.aomedia.org/license/software. If the Alliance for Open

 * Media Patent License 1.0 was not distributed with this source code in the

 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.

*/

#ifndef AOM_AV1_ENCODER_CNN_H_

#define AOM_AV1_ENCODER_CNN_H_

#ifdef __cplusplus

extern "C" {

#endif

#include <math.h>

#include <stdbool.h>

#include "aom_util/aom_thread.h"

#include "config/av1_rtcd.h"

// Forward declaration only; AV1Common is not otherwise referenced in the
// visible part of this header.
struct AV1Common;

// Upper bound on the number of hidden layers in a network.
#define CNN_MAX_HIDDEN_LAYERS 64

// Upper bound on the total number of layers (hidden layers plus one). Used as
// the length of the layer_config array in struct CNN_CONFIG.
#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)

// NOTE(review): presumably the maximum number of channels in any tensor; not
// referenced in this header -- confirm against the implementation.
#define CNN_MAX_CHANNELS 256

// Number of branches supported. The branch field of struct CNN_LAYER_CONFIG is
// documented as lying in [0, CNN_MAX_BRANCHES - 1].
#define CNN_MAX_BRANCHES 4

// NOTE(review): presumably the maximum number of worker threads; not
// referenced in this header -- confirm against the implementation.
#define CNN_MAX_THREADS 32

// Brace initializer with three zeros, matching the three int fields of
// struct CNN_BRANCH_CONFIG (input_to_branches, channels_to_copy,
// branches_to_combine); intended for layers that use no branching.
// Kept on a single line: a backslash continuation followed by an empty line
// leaves the macro body empty and strands the initializer at file scope.
#define NO_BRANCH_CONFIG { 0, 0, 0 }

// Brace initializer with four NULL pointers, matching the four pointer fields
// of struct CNN_BATCHNORM_PARAMS (bn_gamma, bn_beta, bn_mean, bn_std). Per the
// comment on that struct, all-zero pointers mean batchnorm is disabled.
// Kept on a single line: a backslash continuation followed by an empty line
// leaves the macro body empty and strands the initializer at file scope.
#define NO_BN_PARAMS { NULL, NULL, NULL, NULL }

// Padding types; selected per layer through the pad field of
// struct CNN_LAYER_CONFIG.
enum {
  // TensorFlow's SAME padding, with pixels outside the image area assumed to
  // be 0 (default).
  PADDING_SAME_ZERO,
  // TensorFlow's SAME padding, with pixels outside the image area replicated
  // from the closest edge.
  PADDING_SAME_REPLICATE,
  // TensorFlow's VALID padding.
  PADDING_VALID
} UENUM1BYTE(PADDING_TYPE);

// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);

// Points at which a tensor may be copied to the branches given in
// input_to_branches.
enum {
  // Doesn't copy any tensor.
  BRANCH_NO_COPY,
  // Copies the input tensor to branches.
  BRANCH_INPUT,
  // Copies the convolved tensor to branches.
  BRANCH_OUTPUT,
  // Copies the combined (after convolving and branch combining) tensor. If no
  // combinations happen at this layer, this option has the same effect as
  // BRANCH_OUTPUT.
  BRANCH_COMBINED
} UENUM1BYTE(BRANCH_COPY);

// Ways of combining branches with the output of the current layer.
enum {
  // No branch combining.
  BRANCH_NOC,
  // Add the previously stored branch tensor to the output of the layer.
  BRANCH_ADD,
  // Concatenate the branch tensor to the output of the layer.
  BRANCH_CAT
} UENUM1BYTE(BRANCH_COMBINE);

// Per-channel parameters for batch normalization, e.g. bn_mean[c] is the mean
// for all pixels in channel c. Batch normalization is always applied after
// activation, and the output is
//   out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c]
// where
//   norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
// The effect of variance_epsilon is assumed to be already taken into account
// when bn_std is calculated.
// The pointers need to be either all zero or all valid: if all zero,
// batchnorm is disabled; otherwise batchnorm is applied.
struct CNN_BATCHNORM_PARAMS {
  const float *bn_gamma;  // per-channel scale
  const float *bn_beta;   // per-channel offset
  const float *bn_mean;   // per-channel mean
  const float *bn_std;    // per-channel standard deviation
};

// Branching behaviour of a single layer.
struct CNN_BRANCH_CONFIG {
  // If nonzero, copy the active tensor to the current layer and store it for
  // future use in the branches specified in this field as a binary mask. For
  // example, if input_to_branches = 0x06, the input tensor to the current
  // branch is copied to branches 1 and 2 (where 0 represents the primary
  // branch). One restriction is that the mask cannot indicate copying to the
  // current branch.
  int input_to_branches;

  // If greater than 0, only copies the channels up to the given index. Within
  // the layer, a copy of the active tensor is input to the branches given in
  // input_to_branches.
  // NOTE(review): the first sentence previously trailed the input_to_branches
  // comment; it reads as describing this field -- confirm.
  int channels_to_copy;

  // Mask of branches to combine with the output of the current layer, if
  // branch_combine_type != BRANCH_NOC. For example, if
  // branches_to_combine = 0x0A, branches 1 and 3 are combined with the
  // current branch.
  int branches_to_combine;
};

// Configuration of a single CNN layer.
struct CNN_LAYER_CONFIG {
  int in_channels;
  int filter_width;
  int filter_height;
  int out_channels;
  int skip_width;
  int skip_height;

  // Whether to use maxpool or not (only effective when skip_width or
  // skip_height are > 1).
  int maxpool;

  // Array of length filter_height x filter_width x in_channels x out_channels,
  // where the inner-most scan is out_channels and the outer-most scan is
  // filter_height.
  const float *weights;

  // Array of length out_channels.
  const float *bias;

  // Padding type.
  PADDING_TYPE pad;

  // The activation function to use after convolution.
  ACTIVATION activation;

  // Whether this is a deconvolution layer.
  //   0: if skip_width or skip_height are > 1, then we reduce resolution.
  //   1: if skip_width or skip_height are > 1, then we increase resolution.
  int deconvolve;

  // Branch index in [0, CNN_MAX_BRANCHES - 1], where 0 refers to the primary
  // branch.
  int branch;

  BRANCH_COPY branch_copy_type;
  BRANCH_COMBINE branch_combine_type;
  struct CNN_BRANCH_CONFIG branch_config;

  // A struct that contains the parameters used for batch normalization.
  struct CNN_BATCHNORM_PARAMS bn_params;

  // The output buffer idx to which the layer output is written. Set to -1 to
  // disable writing it to the output. In the case that branch_combine_type is
  // BRANCH_CAT, all concatenated channels will be written to output. In the
  // case of BRANCH_ADD, the output will be the result of summation.
  int output_num;
};

// Configuration of a whole network: global options plus one entry per layer.
struct CNN_CONFIG {
  // Number of CNN layers ( = number of hidden layers + 1).
  int num_layers;

  // Whether the output activation is a residue.
  int is_residue;

  // Extension horizontally and vertically.
  int ext_width, ext_height;

  // Whether the input bounds are strict or not. If strict, the extension area
  // is filled by replication; if not strict, image data is assumed available
  // beyond the bounds.
  int strict_bounds;

  // Per-layer configuration; the array holds up to CNN_MAX_LAYERS entries.
  CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
};

// Threading resources passed (as thread_data) to the prediction functions.
struct CNN_THREAD_DATA {
  // NOTE(review): presumably the number of entries in workers -- confirm.
  int num_workers;
  // Worker objects; the AVxWorker type is declared outside this header.
  AVxWorker *workers;
};

// Output description for a CNN with multiple outputs.
// NOTE(review): the three arrays presumably each hold num_outputs entries,
// indexed by the output_num of struct CNN_LAYER_CONFIG -- confirm against the
// implementation.
struct CNN_MULTI_OUT {
  int num_outputs;             // number of outputs
  const int *output_channels;  // channel counts (read-only)
  const int *output_strides;   // strides (read-only)
  float **output_buffer;       // destination buffers
};

// Returns the size of the CNN output through the out_* pointers.
//   in_width, in_height: input dimensions.
//   cnn_config:          network configuration (not modified).
//   out_width, out_height, out_channels: receive the results.
// NOTE(review): whether the out_* pointers refer to single values or to
// per-output arrays is not visible in this header -- confirm.
void av1_find_cnn_output_size(
    int in_width, int in_height, const CNN_CONFIG *cnn_config,
    int *out_width, int *out_height, int *out_channels);

// Returns the output width and output height of the given layer.
//   in_width, in_height:   input dimensions of the layer.
//   layer_config:          configuration of the layer (not modified).
//   out_width, out_height: receive the output dimensions.
void av1_find_cnn_layer_output_size(
    int in_width, int in_height, const CNN_LAYER_CONFIG *layer_config,
    int *out_width, int *out_height);

// Prediction from a set of input image buffers (8-bit samples). Supports CNNs
// with multiple outputs.
//   dgd:           input image buffers.
//   width, height: image dimensions.
//   stride:        stride of the input buffers.
//   cnn_config:    network configuration (not modified).
//   thread_data:   threading resources (not modified).
//   output:        describes where the outputs are written.
// NOTE(review): the meaning of the bool return value is not visible in this
// header; presumably true on success -- confirm.
bool av1_cnn_predict_img_multi_out(
    uint8_t **dgd, int width, int height, int stride,
    const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data,
    struct CNN_MULTI_OUT *output);

// Prediction from a set of input image buffers holding 16-bit samples.
//   dgd:           input image buffers.
//   width, height: image dimensions.
//   stride:        stride of the input buffers.
//   cnn_config:    network configuration (not modified).
//   thread_data:   threading resources (not modified).
//   bit_depth:     bit depth of the input samples.
//   output:        describes where the outputs are written.
// NOTE(review): the meaning of the bool return value is not visible in this
// header; presumably true on success -- confirm.
bool av1_cnn_predict_img_multi_out_highbd(
    uint16_t **dgd, int width, int height, int stride,
    const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data,
    int bit_depth, CNN_MULTI_OUT *output);

#ifdef __cplusplus

}  // extern "C"

#endif

#endif  // AOM_AV1_ENCODER_CNN_H_