1 files changed, 1189 insertions, 0 deletions
diff --git a/third_party/aom/av1/encoder/cnn.c b/third_party/aom/av1/encoder/cnn.c
new file mode 100644
index 0000000000..598b362753
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))  // Clamps a into [0, hi - 1]; a and hi can each be evaluated more than once.
+
+typedef struct {  // Per-worker argument bundle; convolve_layer() unpacks it.
+  const float **input;  // forwarded unchanged to av1_cnn_convolve()
+  int in_width;
+  int in_height;
+  int in_stride;
+  const CNN_LAYER_CONFIG *layer_config;
+  float **output;  // forwarded unchanged to av1_cnn_convolve()
+  int out_stride;
+  int start_idx;  // worker index th, as filled in by convolve_layer_mt()
+  int th_step;  // worker count, as filled in by convolve_layer_mt()
+} CONVOLVE_OPS;
+
+static INLINE float softsign(float x) { return x / (1.0f + fabsf(x)); }
+
+static INLINE float relu(float x) { return !(x < 0) ? x : 0; }
+
+typedef struct {
+  int allocsize;  // floats owned through buf[0]; 0 means nothing to free
+  int channels;
+  int width, height, stride;  // stride is the row pitch, in floats
+  float *buf[CNN_MAX_CHANNELS];  // one plane pointer per channel
+} TENSOR;
+
+static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(TENSOR)); }
+
+static void free_tensor(TENSOR *tensor) {
+  // Borrowed planes (allocsize == 0) are not ours to release.
+  if (tensor->allocsize == 0) return;
+  aom_free(tensor->buf[0]);
+  tensor->buf[0] = NULL;
+  tensor->allocsize = 0;
+}
+
+static bool realloc_tensor(TENSOR *tensor, int channels, int width,
+                           int height) {
+  const int required = channels * width * height;
+  if (required > tensor->allocsize) {  // grow only; never shrinks
+    free_tensor(tensor);
+    float *const storage = (float *)aom_malloc(sizeof(*storage) * required);
+    tensor->buf[0] = storage;
+    if (storage == NULL) return false;
+    tensor->allocsize = required;
+  }
+  tensor->channels = channels;
+  tensor->width = width;
+  tensor->height = height;
+  tensor->stride = width;  // planes are stored without row padding
+  for (int ch = 1; ch < channels; ++ch)
+    tensor->buf[ch] = tensor->buf[0] + ch * width * height;
+  return true;
+}
+
+static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
+                        TENSOR *dst) {
+  assert(src->width == dst->width);
+  assert(src->height == dst->height);
+  assert(copy_channels <= src->channels);
+  const bool packed = src->stride == dst->width && dst->stride == dst->width;
+  for (int c = 0; c < copy_channels; ++c) {
+    const float *from = src->buf[c];
+    float *to = dst->buf[dst_offset + c];
+    if (packed) {  // both planes are contiguous: copy each in one go
+      memcpy(to, from, sizeof(*dst->buf[0]) * src->width * src->height);
+      continue;
+    }
+    // Strided planes: copy one row at a time.
+    for (int r = 0; r < dst->height; ++r) {
+      memcpy(&to[r * dst->stride], &from[r * src->stride],
+             dst->width * sizeof(*dst->buf[c]));
+    }
+  }
+}
+
+static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
+                          int channels, int width, int height, int stride) {
+  // Planes are borrowed: allocsize == 0 makes free_tensor() skip them.
+  tensor->allocsize = 0;
+  tensor->channels = channels;
+  tensor->width = width;
+  tensor->height = height;
+  tensor->stride = stride;
+  // A NULL buf clears the first `channels` plane pointers instead.
+  for (int c = 0; c < channels; ++c) {
+    tensor->buf[c] = (buf != NULL) ? buf[c] : NULL;
+  }
+}
+
+static void swap_tensor(TENSOR *t1, TENSOR *t2) {
+  const TENSOR saved = *t2;  // shallow copy: plane pointers move, data stays
+  *t2 = *t1;
+  *t1 = saved;
+}
+
+// Appends the channels of src after the channels already in dst, growing
+// dst's storage if it is too small. Returns false if that allocation fails.
+static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
+  assert(src->width == dst->width);
+  assert(src->height == dst->height);
+
+  const int dst_channels = dst->channels;
+  const int channels = dst->channels + src->channels;
+  const int newallocsize = channels * dst->width * dst->height;
+  if (dst->allocsize < newallocsize) {
+    TENSOR t;
+    init_tensor(&t);
+    // allocate new buffers and copy first the dst channels
+    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
+    copy_tensor(dst, dst->channels, 0, &t);
+    swap_tensor(dst, &t);  // dst takes over the new storage
+    free_tensor(&t);       // releases the old storage if dst owned it
+  }
+  dst->channels = channels;  // also needed when the old storage was reused
+  for (int c = 1; c < channels; ++c)
+    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
+  // Copy the channels in src after the first dst_channels channels.
+  copy_tensor(src, src->channels, dst_channels, dst);
+  return true;
+}
+
+int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
+  return !(t1->width != t2->width || t1->height != t2->height);
+}
+
+int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
+  if (t1->channels != t2->channels) return 0;  // channel counts must agree too
+  return t1->width == t2->width && t1->height == t2->height;
+}
+
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+                                    const CNN_LAYER_CONFIG *layer_config,
+                                    int *out_width, int *out_height) {
+  assert(layer_config->skip_width > 0);
+  assert(layer_config->skip_height > 0);
+  const int skip_w = layer_config->skip_width;
+  const int skip_h = layer_config->skip_height;
+  const int filt_w = layer_config->filter_width;
+  const int filt_h = layer_config->filter_height;
+  // Both "same" padding modes produce identical output dimensions.
+  const bool same_pad = layer_config->pad == PADDING_SAME_ZERO ||
+                        layer_config->pad == PADDING_SAME_REPLICATE;
+  if (!same_pad && layer_config->pad != PADDING_VALID) {
+    // Without asserts an unknown padding type leaves both outputs untouched.
+    assert(0 && "Unknown padding type");
+    return;
+  }
+
+  if (layer_config->deconvolve) {
+    if (same_pad) {
+      // Each input sample expands into skip_w x skip_h output samples.
+      *out_width = in_width * skip_w;
+      *out_height = in_height * skip_h;
+    } else {
+      // The last input sample still contributes a whole filter footprint.
+      *out_width = (in_width - 1) * skip_w + filt_w;
+      *out_height = (in_height - 1) * skip_h + filt_h;
+    }
+    return;
+  }
+
+  if (same_pad) {
+    // Input size divided by the stride, rounded up for positive sizes.
+    *out_width = (in_width + skip_w - 1) / skip_w;
+    *out_height = (in_height + skip_h - 1) / skip_h;
+  } else {
+    // Counts the stride steps at which the whole filter fits in the input.
+    *out_width = (in_width - filt_w + skip_w) / skip_w;
+    *out_height = (in_height - filt_h + skip_h) / skip_h;
+  }
+}
+
+void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
+                           int channels_per_branch[]) {
+  const int branch = layer_config->branch;
+  const CNN_BRANCH_CONFIG *bc = &layer_config->branch_config;
+  // Pass 1: every other branch that this layer feeds gets a channel count.
+  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+    if (b == branch || !(bc->input_to_branches & (1 << b))) continue;
+    if (layer_config->branch_copy_type == BRANCH_INPUT) {
+      channels_per_branch[b] = layer_config->in_channels;
+    } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+      channels_per_branch[b] = layer_config->out_channels;
+    } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+      channels_per_branch[b] = layer_config->out_channels;
+      for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+        if (c == branch || !(bc->branches_to_combine & (1 << c))) continue;
+        assert(channels_per_branch[c] > 0);
+        channels_per_branch[b] += channels_per_branch[c];
+      }
+    }
+  }
+  // Pass 2: the layer's own branch, plus whatever is combined into it.
+  int own_total = layer_config->out_channels;
+  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+    if (c == branch || !(bc->branches_to_combine & (1 << c))) continue;
+    assert(channels_per_branch[c] > 0);
+    own_total += channels_per_branch[c];
+  }
+  channels_per_branch[branch] = own_total;
+}
+
+#if CONFIG_DEBUG
+// Debug-only helper: returns 1 when at least one layer is flagged as an
+// output layer (output_num != -1), and 0 otherwise.
+static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
+  const CNN_LAYER_CONFIG *const layers = cnn_config->layer_config;
+  int found = 0;
+  // Stop scanning as soon as the first output layer is seen.
+  for (int i = 0; !found && i < cnn_config->num_layers; ++i) {
+    found = (layers[i].output_num != -1);
+  }
+  return found;
+}
+#endif
+
+void av1_find_cnn_output_size(int in_width, int in_height,
+                              const CNN_CONFIG *cnn_config, int *out_width,
+                              int *out_height, int *out_channels) {
+  int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
+  // Current tensor dimensions of every branch; 0 means "not populated yet".
+  int cur_w[CNN_MAX_BRANCHES] = { 0 };
+  int cur_h[CNN_MAX_BRANCHES] = { 0 };
+  // Branch 0 starts from the input size widened by twice ext_width/ext_height.
+  cur_w[0] = in_width + cnn_config->ext_width * 2;
+  cur_h[0] = in_height + cnn_config->ext_height * 2;
+
+#if CONFIG_DEBUG
+  assert(cnn_has_at_least_one_output(cnn_config));
+#endif
+
+  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+    const CNN_BRANCH_CONFIG *bc = &layer_config->branch_config;
+    const int branch = layer_config->branch;
+    // A BRANCH_INPUT copy hands this layer's *input* size to the targets.
+    if (layer_config->branch_copy_type == BRANCH_INPUT) {
+      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+        if (b == branch || !(bc->input_to_branches & (1 << b))) continue;
+        assert(cur_w[branch] > 0 && cur_h[branch] > 0);
+        cur_w[b] = cur_w[branch];
+        cur_h[b] = cur_h[branch];
+      }
+    }
+
+    int o_width = 0, o_height = 0;
+    av1_find_cnn_layer_output_size(cur_w[branch], cur_h[branch], layer_config,
+                                   &o_width, &o_height);
+    cur_w[branch] = o_width;
+    cur_h[branch] = o_height;
+
+    // A BRANCH_OUTPUT copy hands this layer's *output* size to the targets.
+    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+        if (b == branch || !(bc->input_to_branches & (1 << b))) continue;
+        cur_w[b] = o_width;
+        cur_h[b] = o_height;
+      }
+    }
+
+    find_cnn_out_channels(layer_config, channels_per_branch);
+    // Output layers publish their dimensions into slot output_num.
+    const int output_num = layer_config->output_num;
+    if (output_num == -1) continue;
+    out_width[output_num] = o_width;
+    out_height[output_num] = o_height;
+    out_channels[output_num] = channels_per_branch[branch];
+  }
+}
+
+static INLINE int get_start_shift_convolve(int width, int filt_width,
+                                           int stride) {
+  const int rem = width % stride;
+  const int dif = (rem == 0) ? stride - 1 : rem - 1;
+  const int shift = (dif + (filt_width % 2)) / 2;
+  return AOMMIN(shift, (filt_width - 1) / 2);  // never beyond the filter centre
+}
+
+void av1_cnn_add_c(float **output, int channels, int width, int height,
+                   int stride, const float **add) {
+  for (int c = 0; c < channels; ++c)
+    for (int row = 0; row < height; ++row)
+      for (int col = 0, idx = row * stride; col < width; ++col, ++idx) {
+        output[c][idx] += add[c][idx];  // both planes share the same stride
+      }
+}
+
+void av1_cnn_activate_c(float **output, int channels, int width, int height,
+                        int stride, ACTIVATION layer_activation) {
+  // Every sample of every channel is rewritten in place; idx walks one row.
+  if (layer_activation == RELU) {
+    for (int c = 0; c < channels; ++c)
+      for (int r = 0; r < height; ++r)
+        for (int k = 0, idx = r * stride; k < width; ++k, ++idx)
+          output[c][idx] = relu(output[c][idx]);
+  } else if (layer_activation == SOFTSIGN) {
+    for (int c = 0; c < channels; ++c)
+      for (int r = 0; r < height; ++r)
+        for (int k = 0, idx = r * stride; k < width; ++k, ++idx)
+          output[c][idx] = softsign(output[c][idx]);
+  } else if (layer_activation == SIGMOID) {
+    assert(0 && "Sigmoid has not been supported in CNN.");  // TO DO
+  } else if (layer_activation != NONE) {
+    // NONE falls through every branch and leaves the samples untouched.
+    assert(0 && "Unknown activation type");
+  }
+}
+
+static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
+                                           const CNN_LAYER_CONFIG *layer_config,
+                                           int branch, TENSOR branch_output[]) {
+  const CNN_BRANCH_CONFIG *bc = &layer_config->branch_config;
+  // channels_to_copy <= 0 selects every channel of the active tensor.
+  const int copy_channels = bc->channels_to_copy > 0
+                                ? bc->channels_to_copy
+                                : layer_active_tensor->channels;
+  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+    // Only branches set in the mask, and never the layer's own branch.
+    if (b == branch || !(bc->input_to_branches & (1 << b))) continue;
+    TENSOR *const target = &branch_output[b];
+    // Size the destination first; stop at the first failed allocation.
+    if (!realloc_tensor(target, copy_channels, layer_active_tensor->width,
+                        layer_active_tensor->height)) {
+      return false;
+    }
+    // The copy becomes the input of the first layer of branch b.
+    copy_tensor(layer_active_tensor, copy_channels, 0, target);
+  }
+  return true;
+}
+
+// Convolution fused with max pooling, for PADDING_SAME_ZERO. av1_cnn_convolve()
+// selects it when maxpool is set and skip_width or skip_height exceeds 1.
+static void convolve_maxpool_padding_zero(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep, const int filter_width_half,
+    const int filter_height_half) {
+  for (int i = 0; i < layer_config->out_channels; ++i) {  // i: output channel
+    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+             ++hh) {  // (hh, ww): in-bounds positions of the window anchored at (h, w)
+          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+               ++ww) {
+            float sum = layer_config->bias[i];  // convolution response at (hh, ww)
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;  // weight index; each tap advances it by cstep
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii = hh + l - filter_height_half;
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj = ww + m - filter_width_half;
+                  if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+                    continue;  // zero padding: a tap outside the input adds nothing
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            if (h == hh && w == ww)  // first window position seeds the maximum
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Convolution fused with max pooling, for PADDING_SAME_REPLICATE. Selected by
+// av1_cnn_convolve() when maxpool is set and skip_width or skip_height exceeds 1.
+static void convolve_maxpool_padding_replicate(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep, const int filter_width_half,
+    const int filter_height_half) {
+  for (int i = 0; i < layer_config->out_channels; ++i) {  // i: output channel
+    for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+      for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+             ++hh) {  // (hh, ww): in-bounds positions of the window anchored at (h, w)
+          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+               ++ww) {
+            float sum = layer_config->bias[i];  // convolution response at (hh, ww)
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;  // weight index; each tap advances it by cstep
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii =
+                    CLAMPINDEX(hh + l - filter_height_half, in_height);  // rows outside the input reuse the nearest edge row
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj =
+                      CLAMPINDEX(ww + m - filter_width_half, in_width);  // same clamping for columns
+                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            if (h == hh && w == ww)  // first window position seeds the maximum
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Convolution fused with max pooling, for PADDING_VALID. av1_cnn_convolve()
+// selects it when maxpool is set and skip_width or skip_height exceeds 1.
+static void convolve_maxpool_padding_valid(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep) {
+  for (int i = 0; i < layer_config->out_channels; ++i) {  // i: output channel
+    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+         h += layer_config->skip_height, ++u) {  // window anchors keep the filter inside the input
+      for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
+           w += layer_config->skip_width, ++v) {
+        for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+             ++hh) {  // (hh, ww): positions of the window anchored at (h, w)
+          for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+               ++ww) {
+            float sum = layer_config->bias[i];  // convolution response at (hh, ww)
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              int off = k * layer_config->out_channels + i;  // weight index; each tap advances it by cstep
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii = hh + l;  // NOTE(review): for hh > h this can reach in_height near the bottom edge; only the assert below checks it - confirm
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj = ww + m;  // NOTE(review): same concern for ww > w near the right edge
+                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            if (h == hh && w == ww)  // first window position seeds the maximum
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Kernel for 1x1 filters; av1_cnn_convolve() selects it when the max-pool path
+// is not taken and filter_height and filter_width are both equal to 1.
+static void convolve_element_wise(const float **input, int in_width,
+                                  int in_height, int in_stride,
+                                  const CNN_LAYER_CONFIG *const layer_config,
+                                  float **output, int out_stride, int start_idx,
+                                  int step) {
+  const int start_h = get_start_shift_convolve(
+      in_height, layer_config->filter_height, layer_config->skip_height);
+  const int start_w =
+      get_start_shift_convolve(in_width, layer_config->filter_width,
+                               layer_config->skip_width) +
+      start_idx * layer_config->skip_width;  // start_idx shifts the first column handled by this call
+  const int out_w_step = AOMMAX(step, 1);  // a step below 1 is treated as 1
+  const int in_w_step = layer_config->skip_width * out_w_step;
+  for (int i = 0; i < layer_config->out_channels; ++i) {  // every output channel is visited
+    for (int h = start_h, u = 0; h < in_height;
+         h += layer_config->skip_height, ++u) {
+      const int in_h = h * in_stride;
+      const int out_h = u * out_stride + start_idx;
+      for (int w = start_w, out_index = out_h; w < in_width;
+           w += in_w_step, out_index += out_w_step) {  // this call writes every out_w_step-th output column
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          sum += layer_config->weights[k * layer_config->out_channels + i] *
+                 input[k][in_h + w];
+        }
+        output[i][out_index] = sum;
+      }
+    }
+  }
+}
+
+// Strided convolution without pooling, for PADDING_SAME_ZERO. Rather than
+// testing each tap, it steps over the weights of taps that lie outside the input.
+static void convolve_no_maxpool_padding_zero(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int filter_width_half,
+    const int filter_height_half, const int ii_shift, const int jj_shift,
+    const int channel_step) {
+  const int start_h = get_start_shift_convolve(
+      in_height, layer_config->filter_height, layer_config->skip_height);
+  const int start_w = get_start_shift_convolve(
+      in_width, layer_config->filter_width, layer_config->skip_width);
+  const int end_ii_shift = filter_height_half + 1;
+  const int end_jj_shift = filter_width_half + 1;
+  // *_filter_margin stores the number of pixels along a dimension in the
+  // intersection of the complement of the image in the extended image
+  // and the filter.
+  const int top_filter_margin = layer_config->filter_width * ii_shift;
+  const int right_filter_margin = end_jj_shift - in_width;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {  // this call handles channels start_idx, start_idx + channel_step, ...
+    for (int h = start_h, u = 0; h < in_height;
+         h += layer_config->skip_height, ++u) {
+      const int out_h = u * out_stride;
+      const int top_cstep =
+          AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
+              cstep +
+          i;  // weight offset that steps over filter rows above the input, plus channel i
+      const int start_ii = AOMMAX(0, h - ii_shift);
+      const int end_ii = AOMMIN(in_height, h + end_ii_shift);
+      for (int w = start_w, out_index = out_h; w < in_width;
+           w += layer_config->skip_width, ++out_index) {
+        const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;  // weights stepped over for taps left of the input
+        const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;  // weights stepped over for taps right of the input
+        const int start_jj = AOMMAX(0, w - jj_shift);
+        const int end_jj = AOMMIN(in_width, w + end_jj_shift);
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          int off = k * layer_config->out_channels + top_cstep;
+          for (int ii = start_ii; ii < end_ii; ++ii) {  // only rows inside the input are visited
+            off += left_cstep;
+            for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
+              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+            }
+            off += right_cstep;
+          }
+        }
+        output[i][out_index] = sum;
+      }
+    }
+  }
+}
+
+// Strided convolution without pooling, for PADDING_SAME_REPLICATE: taps that
+// fall outside the input read the nearest edge sample (via CLAMPINDEX).
+static void convolve_no_maxpool_padding_replicate(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int ii_shift, const int jj_shift,
+    const int channel_step) {
+  // h and w are shifted to an offset coordinate system to reduce in-loop
+  // computation.
+  const int start_h =
+      get_start_shift_convolve(in_height, layer_config->filter_height,
+                               layer_config->skip_height) -
+      ii_shift;
+  const int start_w =
+      get_start_shift_convolve(in_width, layer_config->filter_width,
+                               layer_config->skip_width) -
+      jj_shift;
+  const int end_h = in_height - ii_shift;
+  const int end_w = in_width - jj_shift;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {  // this call handles channels start_idx, start_idx + channel_step, ...
+    for (int h = start_h, u = 0; h < end_h;
+         h += layer_config->skip_height, ++u) {  // h is the row of the filter's first tap and may be negative
+      const int out_h = u * out_stride;
+      const int upper_ii_index = layer_config->filter_height + h;
+      for (int w = start_w, out_index = out_h; w < end_w;
+           w += layer_config->skip_width, ++out_index) {
+        const int upper_jj_index = layer_config->filter_width + w;
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          int off = k * layer_config->out_channels + i;  // weight index; each tap advances it by cstep
+          for (int ii = h; ii < upper_ii_index; ++ii) {
+            const int clamped_ii = CLAMPINDEX(ii, in_height);
+            for (int jj = w; jj < upper_jj_index; ++jj) {
+              const int clamped_jj = CLAMPINDEX(jj, in_width);
+              assert(clamped_ii >= 0 && clamped_ii < in_height &&
+                     clamped_jj >= 0 && clamped_jj < in_width);
+              sum += layer_config->weights[off] *
+                     input[k][clamped_ii * in_stride + clamped_jj];
+              off += cstep;
+            }
+          }
+        }
+        output[i][out_index] = sum;
+      }
+    }
+  }
+}
+
+// Strided convolution without pooling, for PADDING_VALID: the filter is only
+// placed where it lies entirely inside the input, so no padding is read.
+void av1_cnn_convolve_no_maxpool_padding_valid_c(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step) {
+  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+         !layer_config->maxpool);
+  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+  assert(layer_config->pad == PADDING_VALID);
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {  // this call handles channels start_idx, start_idx + channel_step, ...
+    for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+         h += layer_config->skip_height, ++u) {  // (h, w) is the top-left tap of the filter
+      const int out_h = u * out_stride;
+      const int upper_ii_index = layer_config->filter_height + h;
+      for (int w = 0, out_index = out_h;
+           w < in_width - layer_config->filter_width + 1;
+           w += layer_config->skip_width, ++out_index) {
+        const int upper_jj_index = layer_config->filter_width + w;
+        float sum = layer_config->bias[i];
+        for (int k = 0; k < layer_config->in_channels; ++k) {
+          int off = k * layer_config->out_channels + i;  // weight index; each tap advances it by cstep
+          for (int ii = h; ii < upper_ii_index; ++ii) {
+            for (int jj = w; jj < upper_jj_index; ++jj) {
+              assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+              sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+              off += cstep;
+            }
+          }
+        }
+        output[i][out_index] = sum;
+      }
+    }
+  }
+}
+
+static void av1_cnn_convolve(const float **input, int in_width, int in_height,
+                             int in_stride,
+                             const CNN_LAYER_CONFIG *layer_config,
+                             float **output, int out_stride, int start_idx,
+                             int step) {
+  assert(!layer_config->deconvolve);
+  const int cstep = layer_config->in_channels * layer_config->out_channels;
+  const int filter_height_half = layer_config->filter_height >> 1;
+  const int filter_width_half = layer_config->filter_width >> 1;
+  const int channel_step = AOMMAX(step, 1);
+  // Pooling path: taken only when pooling is on and some stride exceeds 1.
+  const bool pooled =
+      layer_config->maxpool &&
+      (layer_config->skip_height > 1 || layer_config->skip_width > 1);
+  if (pooled) {
+    // start_idx and step are not forwarded to the pooled kernels.
+    if (layer_config->pad == PADDING_SAME_ZERO) {
+      convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
+                                    layer_config, output, out_stride, cstep,
+                                    filter_width_half, filter_height_half);
+    } else if (layer_config->pad == PADDING_SAME_REPLICATE) {
+      convolve_maxpool_padding_replicate(
+          input, in_width, in_height, in_stride, layer_config, output,
+          out_stride, cstep, filter_width_half, filter_height_half);
+    } else if (layer_config->pad == PADDING_VALID) {
+      convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
+                                     layer_config, output, out_stride, cstep);
+    } else {
+      assert(0 && "Unknown padding type");
+    }
+    return;
+  }
+  // Results in element-wise matrix multiplication.
+  if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
+    convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
+                          output, out_stride, start_idx, step);
+    return;
+  }
+  // Offset of the filter's first tap relative to the output position; one
+  // less than half the size for even filter sizes.
+  const int ii_shift =
+      filter_height_half - (layer_config->filter_height - 1) % 2;
+  const int jj_shift =
+      filter_width_half - (layer_config->filter_width - 1) % 2;
+  if (layer_config->pad == PADDING_SAME_ZERO) {
+    convolve_no_maxpool_padding_zero(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, filter_width_half, filter_height_half,
+        ii_shift, jj_shift, channel_step);
+  } else if (layer_config->pad == PADDING_SAME_REPLICATE) {
+    convolve_no_maxpool_padding_replicate(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
+  } else if (layer_config->pad == PADDING_VALID) {
+    av1_cnn_convolve_no_maxpool_padding_valid(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, channel_step);
+  } else {
+    assert(0 && "Unknown padding type");
+  }
+}
+
+static int convolve_layer(void *arg1, void *arg2) {
+  // Worker hook: arg1 carries the CONVOLVE_OPS for this worker, arg2 is unused.
+  (void)arg2;
+  const CONVOLVE_OPS *const ops = (const CONVOLVE_OPS *)arg1;
+  av1_cnn_convolve(ops->input, ops->in_width, ops->in_height, ops->in_stride,
+                   ops->layer_config, ops->output, ops->out_stride,
+                   ops->start_idx, ops->th_step);
+  return 1;
+}
+
+// Runs one convolution layer on the workers in thread_data: worker th is
+// handed start_idx = th and th_step = worker count (capped to CNN_MAX_THREADS).
+static void convolve_layer_mt(const float **input, int in_width, int in_height,
+                              int in_stride,
+                              const CNN_LAYER_CONFIG *layer_config,
+                              const CNN_THREAD_DATA *thread_data,
+                              float **output, int out_stride) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  const int num_workers = AOMMIN(thread_data->num_workers, CNN_MAX_THREADS);
+  assert(thread_data->workers);
+  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
+  for (int th = 0; th < num_workers; ++th) {
+    AVxWorker *const worker = &thread_data->workers[th];
+    winterface->reset(worker);
+    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
+                                 in_stride,  layer_config, output,
+                                 out_stride, th,           num_workers };
+    convolve_ops[th] = convolve_op;
+    worker->hook = convolve_layer;
+    worker->data1 = &(convolve_ops[th]);
+    worker->data2 = NULL;
+
+    // Start convolving: the last worker uses execute(), the others launch().
+    if (th == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait until all workers have finished.
+  for (int th = 0; th < num_workers; ++th) {
+    winterface->sync(&thread_data->workers[th]);
+  }
+}
+
+static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
+  // Half of the amount by which the filter exceeds the stride, if any.
+  return (filt_width > stride) ? (filt_width - stride) / 2 : 0;
+}
+
+// Per-channel batch normalisation, applied in place to every sample:
+//   x = gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch]
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
+                         int stride, const float *gamma, const float *beta,
+                         const float *mean, const float *std) {
+  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
+  for (int ch = 0; ch < channels; ch++) {
+    const float ch_gamma = gamma[ch];
+    const float ch_beta = beta[ch];
+    const float ch_mean = mean[ch];
+    const float ch_std = std[ch];  // divisor, used as given (not checked for 0)
+    float *image_row = image[ch];
+    for (int row = 0; row < height; row++) {
+      for (int col = 0; col < width; col++)
+        image_row[col] =
+            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
+      image_row += stride;  // stride is the row pitch, in floats
+    }
+  }
+}
+
+// C implementation of a deconvolution layer.
+//
+// The output size comes from av1_find_cnn_layer_output_size(). Every output
+// sample starts from the bias of its output channel and accumulates
+// weight * input over all input channels and filter taps. A tap contributes
+// only if its (possibly shifted) coordinate is a multiple of the layer's
+// skip_height / skip_width; the quotient is the input row / column read.
+//
+// The weight index of input channel k and output channel i starts at
+// k * out_channels + i and advances by in_channels * out_channels per tap,
+// rows first, then columns within a row.
+//
+// Padding modes:
+//   PADDING_SAME_ZERO       coordinates are shifted by
+//                           get_start_shift_deconvolve(); taps that fall
+//                           outside the input are skipped.
+//   PADDING_SAME_REPLICATE  same shift; outside taps are clamped to the
+//                           nearest input sample instead.
+//   PADDING_VALID           no shift; outside taps are skipped.
+// Any other value trips an assert and leaves |output| unwritten.
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
+                          int in_stride, const CNN_LAYER_CONFIG *layer_config,
+                          float **output, int out_stride) {
+  assert(layer_config->deconvolve);
+
+  const int num_in = layer_config->in_channels;
+  const int num_out = layer_config->out_channels;
+  const int filt_h = layer_config->filter_height;
+  const int filt_w = layer_config->filter_width;
+  const int skip_h = layer_config->skip_height;
+  const int skip_w = layer_config->skip_width;
+  const int tap_step = num_in * num_out;
+
+  int out_width = 0;
+  int out_height = 0;
+  av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
+                                 &out_height);
+
+  // Reduce the padding mode to the two properties the loops depend on.
+  bool shifted = false;
+  bool replicate = false;
+  switch (layer_config->pad) {
+    case PADDING_SAME_ZERO: shifted = true; break;
+    case PADDING_SAME_REPLICATE:
+      shifted = true;
+      replicate = true;
+      break;
+    case PADDING_VALID: break;
+    default: assert(0 && "Unknown padding type"); return;
+  }
+  const int shift_h = shifted ? get_start_shift_deconvolve(filt_h, skip_h) : 0;
+  const int shift_w = shifted ? get_start_shift_deconvolve(filt_w, skip_w) : 0;
+
+  for (int oc = 0; oc < num_out; ++oc) {
+    for (int row = 0; row < out_height; ++row) {
+      for (int col = 0; col < out_width; ++col) {
+        float acc = layer_config->bias[oc];
+        for (int ic = 0; ic < num_in; ++ic) {
+          int tap = ic * num_out + oc;
+          for (int fy = 0; fy < filt_h; ++fy) {
+            const int y = row - fy + shift_h;
+            for (int fx = 0; fx < filt_w; ++fx, tap += tap_step) {
+              const int x = col - fx + shift_w;
+              // Only positions aligned to the skip grid map to an input
+              // sample.
+              if ((y % skip_h) != 0 || (x % skip_w) != 0) continue;
+
+              int src_row = y / skip_h;
+              int src_col = x / skip_w;
+              if (replicate) {
+                src_row = CLAMPINDEX(src_row, in_height);
+                src_col = CLAMPINDEX(src_col, in_width);
+                assert(src_row >= 0 && src_row < in_height && src_col >= 0 &&
+                       src_col < in_width);
+              } else if (src_row < 0 || src_row >= in_height || src_col < 0 ||
+                         src_col >= in_width) {
+                continue;
+              }
+              acc += layer_config->weights[tap] *
+                     input[ic][src_row * in_stride + src_col];
+            }
+          }
+        }
+        output[oc][row * out_stride + col] = acc;
+      }
+    }
+  }
+}
+
+// Runs the network described by |cnn_config| on |input| and writes the
+// results into the caller-provided buffers of |output_struct|.
+//
+// Layers are evaluated in order. For a layer on branch b, tensor1[b] is the
+// layer input and tensor2[b] the layer output; before every layer except the
+// first the two are swapped, so the previous output of that branch becomes
+// the new input. A layer with output_num == -1 writes into scratch memory
+// obtained through realloc_tensor(); any other output_num selects the entry
+// of |output_struct| that the layer writes into directly.
+//
+// Per layer the steps are, in this order: optional copy of the input to other
+// branches, convolution (multi-threaded when thread_data->num_workers > 1) or
+// deconvolution, optional copy of the raw output, optional element-wise add
+// of other branches, activation, optional batch normalization, optional
+// concatenation of other branches, optional copy of the combined result.
+//
+// Returns true on success. Returns false when a tensor allocation, copy or
+// concatenation fails, or when |output_struct| asks for more outputs than the
+// local output table can hold. free_tensor() is called on every branch tensor
+// before returning from the layer loop, on success and on failure.
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
+                       int in_stride, const CNN_CONFIG *cnn_config,
+                       const CNN_THREAD_DATA *thread_data,
+                       CNN_MULTI_OUT *output_struct) {
+  bool success = false;
+  TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
+  TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
+
+  // |output| below has CNN_MAX_BRANCHES slots. Refuse a larger request
+  // instead of writing past the end of the array. Nothing has been allocated
+  // yet, so a plain return is enough.
+  if (output_struct->num_outputs > CNN_MAX_BRANCHES) return false;
+
+  // output[n] points at the first channel pointer of the n-th output inside
+  // the flat output_buffer array; the outputs are laid out back to back.
+  float **output[CNN_MAX_BRANCHES];
+  const int *out_chs = output_struct->output_channels;
+  output[0] = output_struct->output_buffer;
+  for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
+    output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
+  }
+
+  int i_width = in_width;
+  int i_height = in_height;
+  int o_width = 0, o_height = 0;
+  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+    init_tensor(&tensor1[b]);
+    init_tensor(&tensor2[b]);
+  }
+
+  const int *out_stride = output_struct->output_strides;
+  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+    const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+    const int branch = layer_config->branch;
+    const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+
+    // Set up the input tensor
+    if (layer == 0) {       // First layer
+      assert(branch == 0);  // First layer must be primary branch
+      assign_tensor(&tensor1[branch], (float **)input,
+                    layer_config->in_channels, in_width, in_height, in_stride);
+    } else {  // Non-first layer
+      // Swap tensor1 and tensor2
+      swap_tensor(&tensor1[branch], &tensor2[branch]);
+
+      i_width = tensor1[branch].width;
+      i_height = tensor1[branch].height;
+    }
+
+    // Set up the output tensor
+    av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
+                                   &o_height);
+    const int output_num = layer_config->output_num;
+    if (output_num == -1) {  // Non-output layer
+      if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
+                          o_height)) {
+        goto Error;
+      }
+    } else {  // Output layer
+      free_tensor(&tensor2[branch]);
+      assign_tensor(&tensor2[branch], output[output_num],
+                    layer_config->out_channels, o_width, o_height,
+                    out_stride[output_num]);
+    }
+
+    // If we are combining branches make sure that the branch to combine
+    // is different from the current branch.
+    assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
+                   !(branch_config->branches_to_combine & (1 << branch))));
+
+    if (layer_config->branch_copy_type == BRANCH_INPUT) {
+      if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
+                                          branch, tensor2)) {
+        goto Error;
+      }
+    }
+    // Check consistency of input and output channels
+    assert(tensor1[branch].channels == layer_config->in_channels);
+    assert(tensor2[branch].channels == layer_config->out_channels);
+
+    // Convolve/Deconvolve
+    if (!cnn_config->layer_config[layer].deconvolve) {
+      if (thread_data->num_workers > 1) {
+        convolve_layer_mt((const float **)tensor1[branch].buf,
+                          tensor1[branch].width, tensor1[branch].height,
+                          tensor1[branch].stride, layer_config, thread_data,
+                          tensor2[branch].buf, tensor2[branch].stride);
+      } else {
+        av1_cnn_convolve((const float **)tensor1[branch].buf,
+                         tensor1[branch].width, tensor1[branch].height,
+                         tensor1[branch].stride, layer_config,
+                         tensor2[branch].buf, tensor2[branch].stride, 0, 1);
+      }
+    } else {
+      av1_cnn_deconvolve((const float **)tensor1[branch].buf,
+                         tensor1[branch].width, tensor1[branch].height,
+                         tensor1[branch].stride, layer_config,
+                         tensor2[branch].buf, tensor2[branch].stride);
+    }
+
+    if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+                                          branch, tensor2)) {
+        goto Error;
+      }
+    }
+
+    // Add tensors from other branches if needed
+    if (layer_config->branch_combine_type == BRANCH_ADD) {
+      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+        if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+          assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
+          av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
+                      tensor2[branch].width, tensor2[branch].height,
+                      tensor2[branch].stride, (const float **)tensor2[b].buf);
+        }
+      }
+    }
+
+    // Non-linearity
+    av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
+                     tensor2[branch].width, tensor2[branch].height,
+                     tensor2[branch].stride, layer_config->activation);
+
+    // Batch normalization runs only when the layer provides a gamma array.
+    if (layer_config->bn_params.bn_gamma) {
+      av1_cnn_batchnorm(
+          tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
+          tensor2[branch].height, tensor2[branch].stride,
+          layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
+          layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
+    }
+
+    // Concatenate tensors
+    if (layer_config->branch_combine_type == BRANCH_CAT) {
+      if (output_num == -1) {  // Non-output layer
+        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+            assert(tensor2[b].channels > 0);
+            if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
+          }
+        }
+      } else {  // Output layer
+        // First pass: count the channels of the concatenated result so the
+        // output tensor can be re-assigned with the full channel count.
+        const int existing_channels = tensor2[branch].channels;
+        int num_chs = existing_channels;
+        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+            // Needed only to assign the new channel buffers
+            num_chs += tensor2[b].channels;
+          }
+        }
+        assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
+                      o_height, out_stride[output_num]);
+
+        // Second pass: copy each combined branch behind the channels that
+        // are already in place.
+        num_chs = existing_channels;
+        for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+          if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+            assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+            copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
+                        &tensor2[branch]);
+            num_chs += tensor2[b].channels;
+          }
+        }
+      }
+    }
+
+    if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+      if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+                                          branch, tensor2)) {
+        goto Error;
+      }
+    }
+  }
+
+  success = true;
+Error:
+  // Shared exit path for success and failure.
+  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+    free_tensor(&tensor1[b]);
+    free_tensor(&tensor2[b]);
+  }
+  return success;
+}
+
+// Converts the 8-bit planes in |dgd| to floats (sample / 255), surrounds each
+// plane with a border of cnn_config->ext_width columns and
+// cnn_config->ext_height rows, and runs av1_cnn_predict() on the result.
+//
+// With cnn_config->strict_bounds set, only the |width| x |height| area of
+// each plane is read and the border is filled by replicating the edge
+// samples. Otherwise the border samples are read from |dgd| itself, at
+// negative and past-the-end row / column indices.
+//
+// Assume output already has proper allocation
+// Assume input image buffers all have same resolution and strides
+//
+// Returns false if the first layer needs more than CNN_MAX_CHANNELS input
+// channels, if the temporary float buffer cannot be allocated, or if
+// av1_cnn_predict() fails.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+                                   int stride, const CNN_CONFIG *cnn_config,
+                                   const CNN_THREAD_DATA *thread_data,
+                                   CNN_MULTI_OUT *output) {
+  const float max_val = 255.0;
+
+  const int in_width = width + 2 * cnn_config->ext_width;
+  const int in_height = height + 2 * cnn_config->ext_height;
+  const int in_channels = cnn_config->layer_config[0].in_channels;
+  // |inputs| only has room for CNN_MAX_CHANNELS plane pointers.
+  if (in_channels > CNN_MAX_CHANNELS) return false;
+
+  // Size arithmetic is done in size_t so that the product of width, height
+  // and channel count cannot overflow int before it reaches the allocator.
+  const size_t plane_size = (size_t)in_width * (size_t)in_height;
+  float *inputs[CNN_MAX_CHANNELS];
+  float *input_ =
+      (float *)aom_malloc(plane_size * in_channels * sizeof(*input_));
+  if (!input_) return false;
+  const int in_stride = in_width;
+
+  for (int c = 0; c < in_channels; ++c) {
+    inputs[c] = input_ + c * plane_size;
+    // |input| points at the first non-border sample of the plane, so the
+    // border is addressed with negative and past-the-end indices.
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom (whole rows, including the side borders)
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
+
+  aom_free(input_);
+  return success;
+}
+
+// High bit depth variant of the image front end: converts the 16-bit planes
+// in |dgd| to floats (sample / ((1 << bit_depth) - 1)), surrounds each plane
+// with a border of cnn_config->ext_width columns and cnn_config->ext_height
+// rows, and runs av1_cnn_predict() on the result.
+//
+// With cnn_config->strict_bounds set, only the |width| x |height| area of
+// each plane is read and the border is filled by replicating the edge
+// samples. Otherwise the border samples are read from |dgd| itself, at
+// negative and past-the-end row / column indices.
+//
+// Assume output already has proper allocation
+// Assume input image buffers all have same resolution and strides
+//
+// Returns false if the first layer needs more than CNN_MAX_CHANNELS input
+// channels, if the temporary float buffer cannot be allocated, or if
+// av1_cnn_predict() fails.
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+                                          int stride,
+                                          const CNN_CONFIG *cnn_config,
+                                          const CNN_THREAD_DATA *thread_data,
+                                          int bit_depth,
+                                          CNN_MULTI_OUT *output) {
+  const float max_val = (float)((1 << bit_depth) - 1);
+
+  const int in_width = width + 2 * cnn_config->ext_width;
+  const int in_height = height + 2 * cnn_config->ext_height;
+  const int in_channels = cnn_config->layer_config[0].in_channels;
+  // |inputs| only has room for CNN_MAX_CHANNELS plane pointers.
+  if (in_channels > CNN_MAX_CHANNELS) return false;
+
+  // Size arithmetic is done in size_t so that the product of width, height
+  // and channel count cannot overflow int before it reaches the allocator.
+  const size_t plane_size = (size_t)in_width * (size_t)in_height;
+  float *inputs[CNN_MAX_CHANNELS];
+  float *input_ =
+      (float *)aom_malloc(plane_size * in_channels * sizeof(*input_));
+  if (!input_) return false;
+  const int in_stride = in_width;
+
+  for (int c = 0; c < in_channels; ++c) {
+    inputs[c] = input_ + c * plane_size;
+    // |input| points at the first non-border sample of the plane, so the
+    // border is addressed with negative and past-the-end indices.
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom (whole rows, including the side borders)
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
+
+  aom_free(input_);
+  return success;
+}