Diffstat (limited to 'third_party/aom/av1/encoder/x86/cnn_avx2.c')
-rw-r--r-- | third_party/aom/av1/encoder/x86/cnn_avx2.c | 532
1 file changed, 532 insertions, 0 deletions
diff --git a/third_party/aom/av1/encoder/x86/cnn_avx2.c b/third_party/aom/av1/encoder/x86/cnn_avx2.c
new file mode 100644
index 0000000000..ee93b3d5a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/cnn_avx2.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+// These masks rearrange source pixels in the order shown below.
+// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7.
+// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14.
+// This shuffling is needed to process 3 5x5 blocks, which need
+// source pixels in the following order:
+// 1st 5x5 block: source pixels needed are 0 to 4,
+// 2nd 5x5 block: source pixels needed are 4 to 8,
+// 3rd 5x5 block: source pixels needed are 8 to 12.
+// Source pixels are loaded as shown below.
+// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7
+// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14
+// After applying the masks, the source pixels are in the order:
+// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6
+//             consists of the 5 pixels needed for the 1st 5x5 block and
+//             the first 3 pixels needed for the 2nd 5x5 block.
+// load_src1 : 7, 8, 8, 9, 10, 11, 12, x
+//             consists of the last 2 pixels needed for the 2nd 5x5 block and
+//             the 5 pixels needed for the 3rd 5x5 block.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
+                                              { 0, 1, 1, 2, 3, 4, 5, 0 } };
+
+// This mask rearranges the weights to match the shuffled source pixel order.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
+                                                 { 3, 4, 0, 1, 2, 3, 4, 0 } };
+
+// Shuffle masks used to rearrange weights corresponding to layer 1 and
+// layer 2. For layer 1 and layer 2, convolution happens at 2x2, as
+// filter_width and filter_height are equal to 2. The weights are therefore
+// rearranged in the order shown below to match the source pixels; in effect,
+// each mask replicates one weight pair across a width of 2.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_weight_layer_1_and_2[2][8]) = {
+  { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
+};
+
+// After the stages of multiplication and accumulation, the output values
+// in the register will be jumbled. The following mask is applied to the
+// output register so that it can be stored to the output buffer in the
+// correct order.
+DECLARE_ALIGNED(32, static const uint32_t,
+                shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
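Editorial note: the shuffle tables above are consumed by _mm256_permutevar8x32_ps, which picks one source lane per output lane according to a 32-bit index vector. Below is a minimal standalone sketch (not part of the patch) showing how shuffle_src_layer0[0] duplicates pixel 4, so that one register covers the 5 taps of the 1st 5x5 block plus the first 3 taps of the 2nd:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const float src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  const uint32_t idx[8] = { 0, 1, 2, 3, 4, 4, 5, 6 };  // shuffle_src_layer0[0]
  // Each output lane i receives src[idx[i]]; note lane 4 is repeated.
  __m256 shuffled = _mm256_permutevar8x32_ps(
      _mm256_loadu_ps(src), _mm256_loadu_si256((const __m256i *)idx));
  float out[8];
  _mm256_storeu_ps(out, shuffled);
  for (int i = 0; i < 8; ++i) printf("%.0f ", out[i]);  // 0 1 2 3 4 4 5 6
  printf("\n");
  return 0;
}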
+
+// Load weights needed for layer 0 (for 5x5 block processing),
+// and fill the registers appropriately to match the source pixel mapping.
+static INLINE void prepare_weights_for_5x5_convolve(
+    const float *layer_config_weights, int off, float weight[5][8],
+    const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0,
+    const __m256i weight_mask_1) {
+  for (int row = 0; row < 5; ++row) {
+    for (int col = 0; col < 5; ++col) {
+      weight[row][col] = layer_config_weights[off];
+      off += cstep;
+    }
+  }
+  shuffle_weight[0] = _mm256_loadu_ps(weight[0]);
+  shuffle_weight[1] = _mm256_loadu_ps(weight[1]);
+  shuffle_weight[2] = _mm256_loadu_ps(weight[2]);
+  shuffle_weight[3] = _mm256_loadu_ps(weight[3]);
+  shuffle_weight[4] = _mm256_loadu_ps(weight[4]);
+
+  shuffle_weight[0] =
+      _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0);
+  shuffle_weight[1] =
+      _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0);
+  shuffle_weight[2] =
+      _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0);
+  shuffle_weight[3] =
+      _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0);
+  shuffle_weight[4] =
+      _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0);
+  shuffle_weight[5] =
+      _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1);
+  shuffle_weight[6] =
+      _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1);
+  shuffle_weight[7] =
+      _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1);
+  shuffle_weight[8] =
+      _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1);
+  shuffle_weight[9] =
+      _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1);
+}
+
+// For each row, loads source pixels 0 to 7 (load_src_0) and 7 to 14
+// (load_src_1), and arranges them appropriately to process 3 blocks.
+#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS()                             \
+  do {                                                                  \
+    for (int row = 0; row < 5; row++) {                                 \
+      load_src_0 = _mm256_loadu_ps(input_ptr);                          \
+      load_src_1 = _mm256_loadu_ps(input_ptr + 7);                      \
+      load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1);      \
+      load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2);      \
+      load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]);  \
+      load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]);  \
+      accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0);             \
+      accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1);             \
+      input_ptr += in_stride;                                           \
+    }                                                                   \
+  } while (0)
+
+// Load masks needed for shuffling of output and weights.
+static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask,
+                                                       __m256i *weight_mask) {
+  // Load shuffle buffer needed to sort the output.
+  *output_mask =
+      _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2);
+
+  // Load shuffle buffers needed for weight.
+  weight_mask[0] =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]);
+  weight_mask[1] =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]);
+}
+
+// Load weights needed for layer 1 and layer 2 (for 2x2 block processing),
+// and fill the registers appropriately to match the source pixel mapping.
+static INLINE void prepare_weights_for_2x2_convolve(
+    const float *layer_config_weights, int off, const int cstep,
+    __m256 *shuffle_weight, __m256i *weight_mask) {
+  // Weights needed for the 2x2 block.
+  float weight[4] = { 0 };
+  for (int i = 0; i < 4; ++i) {
+    weight[i] = layer_config_weights[off];
+    off += cstep;
+  }
+
+  const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight));
+  shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]);
+  shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]);
+}
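Editorial note: the arithmetic these helpers implement is an ordinary valid-padding 5x5 convolution with a skip (stride) of 4. A scalar sketch for a single in/out channel pair follows (hypothetical reference helper, not in the patch; the real code additionally accumulates over in_channels and adds the per-out-channel bias):

static void conv5x5_stride4_valid_ref(const float *in, int in_stride,
                                      int in_width, int in_height,
                                      const float w[5][5], float bias,
                                      float *out, int out_stride) {
  for (int h = 0, u = 0; h + 5 <= in_height; h += 4, ++u) {
    for (int x = 0, v = 0; x + 5 <= in_width; x += 4, ++v) {
      float sum = bias;  // one bias per output channel
      for (int r = 0; r < 5; ++r) {
        for (int c = 0; c < 5; ++c) {
          sum += in[(h + r) * in_stride + (x + c)] * w[r][c];
        }
      }
      out[u * out_stride + v] = sum;
    }
  }
}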
+
+// Do convolution of one 5x5 block.
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride)            \
+  do {                                                                    \
+    __m128 load_src[5];                                                   \
+    load_src[0] = _mm_loadu_ps(input_ptr);                                \
+    last_column_sum += input_ptr[4] * weight[0][4];                       \
+    input_ptr += in_stride;                                               \
+    load_src[1] = _mm_loadu_ps(input_ptr);                                \
+    last_column_sum += input_ptr[4] * weight[1][4];                       \
+    input_ptr += in_stride;                                               \
+    load_src[2] = _mm_loadu_ps(input_ptr);                                \
+    last_column_sum += input_ptr[4] * weight[2][4];                       \
+    input_ptr += in_stride;                                               \
+    load_src[3] = _mm_loadu_ps(input_ptr);                                \
+    last_column_sum += input_ptr[4] * weight[3][4];                       \
+    input_ptr += in_stride;                                               \
+    load_src[4] = _mm_loadu_ps(input_ptr);                                \
+    last_column_sum += input_ptr[4] * weight[4][4];                       \
+                                                                          \
+    load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0]));  \
+    load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1]));  \
+    load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2]));  \
+    load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3]));  \
+    load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4]));  \
+                                                                          \
+    accum0 = _mm_add_ps(load_src[0], accum0);                             \
+    load_src[1] = _mm_add_ps(load_src[1], load_src[2]);                   \
+    load_src[3] = _mm_add_ps(load_src[3], load_src[4]);                   \
+    load_src[1] = _mm_add_ps(load_src[1], load_src[3]);                   \
+    accum0 = _mm_add_ps(accum0, load_src[1]);                             \
+  } while (0)
+
+// Do convolution on 8 horizontal 2x2 blocks.
+static INLINE void perform_convolve_for_8h_2x2_blocks(
+    const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+    __m256i shuffle_output_mask) {
+  __m256 load_src[4];
+  // Load input into source registers.
+  load_src[0] = _mm256_loadu_ps(input_ptr);
+  load_src[1] = _mm256_loadu_ps(input_ptr + 8);
+  load_src[2] = _mm256_loadu_ps(input_ptr + in_stride);
+  load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8);
+
+  // Multiply the loaded input with the corresponding weights.
+  load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+  load_src[1] = _mm256_mul_ps(load_src[1], weight[0]);
+  load_src[2] = _mm256_mul_ps(load_src[2], weight[1]);
+  load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+  // Accumulate across 2x2 blocks.
+  load_src[0] = _mm256_add_ps(load_src[0], load_src[2]);
+  load_src[1] = _mm256_add_ps(load_src[1], load_src[3]);
+  load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]);
+
+  // Sort the output in order to store it into the output buffer.
+  load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+  *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
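Editorial note: _mm256_hadd_ps adds adjacent pairs within each 128-bit lane, so its result interleaves sums from its two arguments. The shuffle_output_layer_1_and_2 mask { 0, 1, 4, 5, 2, 3, 6, 7 } undoes that interleave. A standalone sketch (not part of the patch) demonstrating why the permute is needed:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const float a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };          // pair sums: 1 5 9 13
  const float b[8] = { 10, 10, 20, 20, 30, 30, 40, 40 };  // pair sums: 20 40 60 80
  // Within-lane pair addition yields { 1, 5, 20, 40, 9, 13, 60, 80 }.
  __m256 ha = _mm256_hadd_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b));
  const uint32_t mask[8] = { 0, 1, 4, 5, 2, 3, 6, 7 };
  // The permute restores contiguous order: { 1, 5, 9, 13, 20, 40, 60, 80 }.
  __m256 sorted = _mm256_permutevar8x32_ps(
      ha, _mm256_loadu_si256((const __m256i *)mask));
  float out[8];
  _mm256_storeu_ps(out, sorted);
  for (int i = 0; i < 8; ++i) printf("%.0f ", out[i]);
  printf("\n");
  return 0;
}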
+
+// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks.
+static INLINE void perform_convolve_for_4hx2v_2x2_blocks(
+    const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+    __m256i shuffle_output_mask) {
+  __m256 load_src[4];
+  // Load input into source registers.
+  load_src[0] = _mm256_loadu_ps(input_ptr);
+  load_src[1] = _mm256_loadu_ps(input_ptr + in_stride);
+  load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2));
+  load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3));
+
+  // Multiply the loaded input with the corresponding weights.
+  load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+  load_src[1] = _mm256_mul_ps(load_src[1], weight[1]);
+  load_src[2] = _mm256_mul_ps(load_src[2], weight[0]);
+  load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+  // Accumulate across 2x2 blocks.
+  load_src[0] = _mm256_add_ps(load_src[0], load_src[1]);
+  load_src[2] = _mm256_add_ps(load_src[2], load_src[3]);
+  load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]);
+
+  // Sort the output in order to store it into the output buffer.
+  load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+  *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
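Editorial note: both 2x2 helpers compute the same arithmetic, a valid-padding 2x2 convolution with skip 2. A scalar sketch for one in/out channel pair (hypothetical reference helper, not in the patch), with the taps in row-major order as the weight shuffle masks above imply:

static void conv2x2_stride2_valid_ref(const float *in, int in_stride,
                                      int in_size, const float w[4],
                                      float bias, float *out, int out_stride) {
  for (int h = 0, u = 0; h + 2 <= in_size; h += 2, ++u) {
    for (int x = 0, v = 0; x + 2 <= in_size; x += 2, ++v) {
      out[u * out_stride + v] =
          bias + in[h * in_stride + x] * w[0] +
          in[h * in_stride + x + 1] * w[1] +
          in[(h + 1) * in_stride + x] * w[2] +
          in[(h + 1) * in_stride + x + 1] * w[3];
    }
  }
}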
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when
+// filter_width and filter_height are equal to 5.
+// CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config.
+// Based on the configuration set for each layer, the current encoder
+// always chooses the case of no_maxpool_padding_valid.
+// Also, for layer 0 the convolution happens at the 5x5 level, as
+// filter_width and filter_height are set to 5.
+static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  const int kFilterWidth = 5;
+  const int kFilterHeight = 5;
+  const int kSkipWidth = 4;
+  const int kSkipHeight = 4;
+  assert(layer_config->filter_width == kFilterWidth &&
+         layer_config->filter_height == kFilterHeight);
+  assert(layer_config->skip_width == kSkipWidth &&
+         layer_config->skip_height == kSkipHeight);
+
+  // Load shuffle buffers needed for source.
+  const __m256i block0_1 =
+      _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]);
+  const __m256i block1_2 =
+      _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]);
+
+  // Load shuffle buffers needed for weight.
+  const __m256i weight_mask_0 =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]);
+  const __m256i weight_mask_1 =
+      _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]);
+
+  // Width by which the source pointer advances after each iteration of
+  // processing 3 5x5 blocks.
+  const int kSkipWidthForNextIter = kSkipWidth * 3;
+
+  // Minimum width required to process 3 5x5 blocks at a time:
+  // min width (for processing 3 5x5 blocks) = 2*skip_width + filter_width.
+  // Here, skip_width specifies how far to move while processing the next
+  // block convolution, and filter_width specifies for how many pixels the
+  // filter needs to be applied.
+  const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    const float out_ch_bias = layer_config->bias[i];
+    for (int k = 0; k < layer_config->in_channels; ++k) {
+      __m256 shuffle_weight[10];
+
+      // The weights needed are 5x5; for SIMD purposes the array is made 5x8.
+      float weight[5][8] = { { 0 } };
+      int off = k * layer_config->out_channels + i;
+
+      // In layer 0, the convolution process happens at 5x5.
+      // The 5x5 weights are constant across the whole input plane of a given
+      // in-channel, which is why the load of weights happens only once per
+      // in-channel.
+      prepare_weights_for_5x5_convolve(layer_config->weights, off, weight,
+                                       cstep, shuffle_weight, weight_mask_0,
+                                       weight_mask_1);
+
+      for (int h = 0, u = 0; h < in_height - kFilterHeight + 1;
+           h += kSkipHeight, ++u) {
+        const int out_h = u * out_stride;
+        int v = 0;
+        int w = 0;
+        int rem_width = in_width;
+        // Process 3 5x5 blocks at a time, if sufficient width is present.
+        while (rem_width >= kMinWidthFor3_5x5Blocks) {
+          __m256 load_src_0, load_src_1;
+          __m256 accum_src_0 = _mm256_setzero_ps();
+          __m256 accum_src_1 = _mm256_setzero_ps();
+          const float *input_ptr = &input[k][h * in_stride + w];
+          PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS();
+
+          // Accumulate across columns.
+          __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1);
+          __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1);
+          __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1);
+
+          __m128 accum_l = _mm256_castps256_ps128(accum);
+          __m128 accum_h = _mm256_extractf128_ps(accum, 1);
+
+          __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0);
+          __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h);
+          __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h);
+
+          // 1st 5x5 block output.
+          output[i][out_h + v] =
+              out_ch_bias + _mm_cvtss_f32(tmp_reg_2) +
+              _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1));
+
+          // 2nd 5x5 block output.
+          output[i][out_h + v + 1] =
+              out_ch_bias +
+              _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) +
+              _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2));
+
+          // 3rd 5x5 block output.
+          output[i][out_h + v + 2] =
+              out_ch_bias +
+              _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) +
+              _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3));
+
+          v += 3;
+          w += kSkipWidthForNextIter;
+          rem_width -= kSkipWidthForNextIter;
+        }
+
+        // Process the remaining blocks one 5x5 block at a time.
+        while (rem_width >= kFilterWidth) {
+          float last_column_sum = 0;
+          __m128 accum = _mm_setzero_ps();
+          const float *input_ptr = &input[k][h * in_stride + w];
+          PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride);
+
+          // Accumulate across columns.
+          accum = _mm_hadd_ps(accum, accum);
+          output[i][out_h + v] = out_ch_bias + last_column_sum +
+                                 _mm_cvtss_f32(accum) +
+                                 _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1));
+
+          v += 1;
+          w += kSkipWidth;
+          rem_width -= kSkipWidth;
+        }
+      }
+    }
+  }
+}
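Editorial note: the loop bounds above follow the usual valid-padding geometry; an axis of size in_size with filter f and skip s yields (in_size - f) / s + 1 outputs. A small sketch (hypothetical helper, not in the patch):

static int valid_conv_out_size(int in_size, int filter, int skip) {
  return (in_size - filter) / skip + 1;
}
// For the 2x2 layers below: valid_conv_out_size(16, 2, 2) == 8 and
// valid_conv_out_size(8, 2, 2) == 4, matching the 8x8 and 4x4 outputs of
// layer 1 and layer 2.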
+
+// AVX2 implementation for layer 1.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+    const float **input, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  __m256i weight_mask[2];
+  __m256i shuffle_output_mask;
+  load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+  const int kInHeight = 16;
+  const int kFilterHeight = 2;
+  const int kSkipHeight = 2;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // The out_accum registers store the 2x2 convolve outputs (calculated
+    // over the input block size), which are accumulated across the
+    // in_channels. As per the design, each iteration of the for loop
+    // processes 8 (horizontal) 2x2 blocks and stores them in the
+    // corresponding out_accum register (as the input size is 16x16, a total
+    // of 64 2x2 blocks are present and 8 out_accum registers are enough to
+    // store the outputs).
+    // Hence the for loops corresponding to 'j' and 'h', below, run over the
+    // number of out_accum registers.
+    __m256 out_accum[8];
+    for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg;
+    for (int k = 0; k < layer_config->in_channels; ++k) {
+      __m256 shuffle_weight[2];
+      int off = k * layer_config->out_channels + i;
+      // In layer 1, the convolution process happens at 2x2.
+      // The 2x2 weights are constant across the whole input plane of a given
+      // in-channel, which is why the load of weights happens only once per
+      // in-channel.
+      prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+                                       shuffle_weight, weight_mask);
+
+      for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+           h += kSkipHeight, ++u) {
+        const float *input_ptr = &input[k][h * in_stride];
+        perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride,
+                                           shuffle_weight, &out_accum[u],
+                                           shuffle_output_mask);
+      }
+    }
+    // Store the output of layer 1.
+    for (int j = 0; j < 8; ++j) {
+      _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
+    }
+  }
+}
+
+// AVX2 implementation for layer 2.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+    const float **input, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  __m256i weight_mask[2];
+  __m256i shuffle_output_mask;
+  load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+  const int kInHeight = 8;
+  const int kFilterHeight = 2;
+  const int kSkipHeight = 2;
+  for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+    __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // The out_accum registers store the 2x2 convolve outputs (calculated
+    // over the input block size), which are accumulated across the
+    // in_channels. As per the design, each iteration of the for loop
+    // processes 8 (4 horizontal x 2 vertical) 2x2 blocks and stores them in
+    // the corresponding out_accum register (as the input size is 8x8, a
+    // total of 16 2x2 blocks are present and 2 out_accum registers are
+    // enough to store the outputs).
+    // Hence the for loops corresponding to 'j' and 'h', below, run over the
+    // number of out_accum registers.
+    __m256 out_accum[2];
+
+    // Height by which the source pointer advances per iteration, while
+    // processing 2 2x2 blocks vertically.
+    const int kSkipHeightForNextIter = kSkipHeight * 2;
+    for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg;
+    for (int k = 0; k < layer_config->in_channels; ++k) {
+      __m256 shuffle_weight[2];
+      int off = k * layer_config->out_channels + i;
+      // In layer 2, the convolution process happens at 2x2.
+      // The 2x2 weights are constant across the whole input plane of a given
+      // in-channel, which is why the load of weights happens only once per
+      // in-channel.
+      prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+                                       shuffle_weight, weight_mask);
+
+      for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+           h += kSkipHeightForNextIter, ++u) {
+        const float *input_ptr = &input[k][h * in_stride];
+        perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride,
+                                              shuffle_weight, &out_accum[u],
+                                              shuffle_output_mask);
+      }
+    }
+    // Store the output of layer 2.
+    for (int j = 0; j < 2; ++j) {
+      _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
+    }
+  }
+}
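Editorial note: start_idx and channel_step exist so that output channels can be partitioned across workers; each worker strides through the channel list. A hedged sketch of that partitioning (hypothetical harness, not in the patch):

// Worker t of n handles out-channels t, t + n, t + 2n, ...
static void convolve_channels_for_worker(int worker_idx, int num_workers,
                                         int out_channels) {
  const int start_idx = worker_idx;
  const int channel_step = num_workers;
  for (int i = start_idx; i < out_channels; i += channel_step) {
    // Convolve output channel i, e.g. with one of the layer kernels above.
  }
}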
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when
+// filter_width and filter_height are equal to 2.
+// As per the layer config set by av1_intra_mode_cnn_partition_cnn_config,
+// filter_width and filter_height are equal to 2 for layer >= 1. So
+// convolution happens at 2x2 for layer >= 1.
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  assert(layer_config->filter_width == 2 && layer_config->filter_height == 2);
+  assert(layer_config->skip_width == 2 && layer_config->skip_height == 2);
+
+  if (in_width == 16 && in_height == 16) {
+    // in_width and in_height equal to 16 corresponds to layer 1.
+    // The output size of this layer is 8x8.
+    cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+        input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+        channel_step);
+  } else if (in_width == 8 && in_height == 8) {
+    // in_width and in_height equal to 8 corresponds to layer 2.
+    // The output size of this layer is 4x4.
+    cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+        input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+        channel_step);
+  } else {
+    // For layers 3 and 4, the input is of size 4x4 and 2x2, respectively.
+    // Implementing SIMD for these sizes is unlikely to be worthwhile, which
+    // is why the C path is called for layer >= 3.
+    av1_cnn_convolve_no_maxpool_padding_valid_c(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, channel_step);
+  }
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// As per the current encoder, the av1_cnn_convolve function gets called for
+// a block size equal to 64x64. av1_cnn_convolve() uses the layer config
+// values set by av1_intra_mode_cnn_partition_cnn_config. The following are a
+// few details related to each layer's config parameters.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+//     0        64x64   16x16      5         5         4       4
+//     1        16x16    8x8       2         2         2       2
+//     2         8x8     4x4       2         2         2       2
+//     3         4x4     2x2       2         2         2       2
+//     4         2x2     1x1       2         2         2       2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
+void av1_cnn_convolve_no_maxpool_padding_valid_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step) {
+  if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
+      layer_config->skip_width == 4 && layer_config->skip_height == 4) {
+    cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, channel_step);
+  } else if (layer_config->filter_width == 2 &&
+             layer_config->filter_height == 2 &&
+             layer_config->skip_width == 2 && layer_config->skip_height == 2) {
+    cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, channel_step);
+  } else {
+    av1_cnn_convolve_no_maxpool_padding_valid_c(
+        input, in_width, in_height, in_stride, layer_config, output,
+        out_stride, start_idx, cstep, channel_step);
+  }
+}
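Editorial note: the weight indexing used throughout (off = k * out_channels + i, advancing by cstep per filter tap) implies a tap-major weight layout. A small illustrative accessor follows (hypothetical, not in the patch; it assumes cstep == in_channels * out_channels, as supplied by the caller in av1/encoder/cnn.c):

// Returns the weight for filter tap 'tap' connecting in-channel k to
// out-channel i, assuming consecutive taps are stored cstep floats apart.
static float weight_at(const float *weights, int cstep, int out_channels,
                       int k, int i, int tap) {
  return weights[tap * cstep + k * out_channels + i];
}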