| field | value | date |
|---|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:35:49 +0000 |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:35:49 +0000 |
| commit | d8bbc7858622b6d9c278469aab701ca0b609cddf | |
| tree | eff41dc61d9f714852212739e6b3738b82a2af87 /media | |
| parent | Releasing progress-linux version 125.0.3-1~progress7.99u1. | |
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media')
238 files changed, 10624 insertions, 6869 deletions
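Functionally, the ffvpx part of this merge turns on FFmpeg's libopus and libvorbis encoder wrappers (see the `CONFIG_LIBOPUS_ENCODER`/`CONFIG_LIBVORBIS_ENCODER` flips below) and carries a local `opusenc-dtx.patch` that adds a private `dtx` option to the Opus encoder. As a rough sketch of what that enables (not part of the commit itself; the helper name and the parameter values are chosen purely for illustration), a caller could enable DTX through the standard AVOptions API:

```c
#include <libavcodec/avcodec.h>
#include <libavutil/channel_layout.h>
#include <libavutil/opt.h>

/* Hypothetical helper: open the libopus wrapper with DTX enabled. */
static AVCodecContext *open_opus_dtx(void)
{
    const AVCodec *codec = avcodec_find_encoder_by_name("libopus");
    AVCodecContext *ctx;

    if (!codec || !(ctx = avcodec_alloc_context3(codec)))
        return NULL;

    ctx->sample_rate = 48000;             /* native Opus rate */
    ctx->sample_fmt  = AV_SAMPLE_FMT_FLT; /* the wrapper accepts s16 or flt */
    ctx->bit_rate    = 64000;
    av_channel_layout_default(&ctx->ch_layout, 1); /* mono, for example */

    /* The private "dtx" option added by opusenc-dtx.patch; it maps to
     * OPUS_SET_DTX() in libopus_configure_encoder(). */
    av_opt_set_int(ctx->priv_data, "dtx", 1, 0);

    if (avcodec_open2(ctx, codec, NULL) < 0) {
        avcodec_free_context(&ctx);
        return NULL;
    }
    return ctx;
}
```

With `dtx` set, libopus may emit near-empty packets during stretches of silence, which is the bandwidth-saving behavior speech capture (e.g. WebRTC) wants.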
diff --git a/media/ffvpx/README_MOZILLA b/media/ffvpx/README_MOZILLA index 1c00f2761a..b766be4abd 100644 --- a/media/ffvpx/README_MOZILLA +++ b/media/ffvpx/README_MOZILLA @@ -158,6 +158,8 @@ There are going to be a lot of changes in terms of symbols exported. Adjust `libavutil/avutil.symbols` and `libavcodec/avcodec.symbols` by removing and adding symbols until the build passes. -Finally, apply the patch: +Finally, apply the patches: - no-unicode-stdio.patch to avoid passing the infinity symbol in Unicode to a stdio.h function, which causes bug 1879740 on Windows. +- opusenc-dtx.patch to allow enabling DTX in the opus encoder. + diff --git a/media/ffvpx/config_components_audio_only.h b/media/ffvpx/config_components_audio_only.h index 0e61e23898..4ba265a9f7 100644 --- a/media/ffvpx/config_components_audio_only.h +++ b/media/ffvpx/config_components_audio_only.h @@ -787,7 +787,7 @@ #define CONFIG_LIBMP3LAME_ENCODER 0 #define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0 #define CONFIG_LIBOPENJPEG_ENCODER 0 -#define CONFIG_LIBOPUS_ENCODER 0 +#define CONFIG_LIBOPUS_ENCODER 1 #define CONFIG_LIBRAV1E_ENCODER 0 #define CONFIG_LIBSHINE_ENCODER 0 #define CONFIG_LIBSPEEX_ENCODER 0 @@ -795,7 +795,7 @@ #define CONFIG_LIBTHEORA_ENCODER 0 #define CONFIG_LIBTWOLAME_ENCODER 0 #define CONFIG_LIBVO_AMRWBENC_ENCODER 0 -#define CONFIG_LIBVORBIS_ENCODER 0 +#define CONFIG_LIBVORBIS_ENCODER 1 #define CONFIG_LIBVPX_VP8_ENCODER 0 #define CONFIG_LIBVPX_VP9_ENCODER 0 #define CONFIG_LIBWEBP_ANIM_ENCODER 0 diff --git a/media/ffvpx/config_components_audio_video.h b/media/ffvpx/config_components_audio_video.h index c8423a895e..220eb6ca52 100644 --- a/media/ffvpx/config_components_audio_video.h +++ b/media/ffvpx/config_components_audio_video.h @@ -810,7 +810,7 @@ #define CONFIG_LIBMP3LAME_ENCODER 0 #define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0 #define CONFIG_LIBOPENJPEG_ENCODER 0 -#define CONFIG_LIBOPUS_ENCODER 0 +#define CONFIG_LIBOPUS_ENCODER 1 #define CONFIG_LIBRAV1E_ENCODER 0 #define CONFIG_LIBSHINE_ENCODER 0 #define CONFIG_LIBSPEEX_ENCODER 0 @@ -818,7 +818,7 @@ #define CONFIG_LIBTHEORA_ENCODER 0 #define CONFIG_LIBTWOLAME_ENCODER 0 #define CONFIG_LIBVO_AMRWBENC_ENCODER 0 -#define CONFIG_LIBVORBIS_ENCODER 0 +#define CONFIG_LIBVORBIS_ENCODER 1 #define CONFIG_LIBVPX_VP8_ENCODER 1 #define CONFIG_LIBVPX_VP9_ENCODER 1 #define CONFIG_LIBWEBP_ANIM_ENCODER 0 diff --git a/media/ffvpx/libavcodec/audio_frame_queue.c b/media/ffvpx/libavcodec/audio_frame_queue.c new file mode 100644 index 0000000000..08b4b368c7 --- /dev/null +++ b/media/ffvpx/libavcodec/audio_frame_queue.c @@ -0,0 +1,113 @@ +/* + * Audio Frame Queue + * Copyright (c) 2012 Justin Ruggles + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/common.h" +#include "audio_frame_queue.h" +#include "encode.h" +#include "libavutil/avassert.h" + +av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq) +{ + afq->avctx = avctx; + afq->remaining_delay = avctx->initial_padding; + afq->remaining_samples = avctx->initial_padding; + afq->frame_count = 0; +} + +void ff_af_queue_close(AudioFrameQueue *afq) +{ + if(afq->frame_count) + av_log(afq->avctx, AV_LOG_WARNING, "%d frames left in the queue on closing\n", afq->frame_count); + av_freep(&afq->frames); + memset(afq, 0, sizeof(*afq)); +} + +int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f) +{ + AudioFrame *new = av_fast_realloc(afq->frames, &afq->frame_alloc, sizeof(*afq->frames)*(afq->frame_count+1)); + if(!new) + return AVERROR(ENOMEM); + afq->frames = new; + new += afq->frame_count; + + /* get frame parameters */ + new->duration = f->nb_samples; + new->duration += afq->remaining_delay; + if (f->pts != AV_NOPTS_VALUE) { + new->pts = av_rescale_q(f->pts, + afq->avctx->time_base, + (AVRational){ 1, afq->avctx->sample_rate }); + new->pts -= afq->remaining_delay; + if(afq->frame_count && new[-1].pts >= new->pts) + av_log(afq->avctx, AV_LOG_WARNING, "Queue input is backward in time\n"); + } else { + new->pts = AV_NOPTS_VALUE; + } + afq->remaining_delay = 0; + + /* add frame sample count */ + afq->remaining_samples += f->nb_samples; + + afq->frame_count++; + + return 0; +} + +void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts, + int64_t *duration) +{ + int64_t out_pts = AV_NOPTS_VALUE; + int removed_samples = 0; + int i; + + if (afq->frame_count || afq->frame_alloc) { + if (afq->frames->pts != AV_NOPTS_VALUE) + out_pts = afq->frames->pts; + } + if(!afq->frame_count) + av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples); + if (pts) + *pts = ff_samples_to_time_base(afq->avctx, out_pts); + + for(i=0; nb_samples && i<afq->frame_count; i++){ + int n= FFMIN(afq->frames[i].duration, nb_samples); + afq->frames[i].duration -= n; + nb_samples -= n; + removed_samples += n; + if(afq->frames[i].pts != AV_NOPTS_VALUE) + afq->frames[i].pts += n; + } + afq->remaining_samples -= removed_samples; + i -= i && afq->frames[i-1].duration; + memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i)); + afq->frame_count -= i; + + if(nb_samples){ + av_assert0(!afq->frame_count); + av_assert0(afq->remaining_samples == afq->remaining_delay); + if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE) + afq->frames[0].pts += nb_samples; + av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples); + } + if (duration) + *duration = ff_samples_to_time_base(afq->avctx, removed_samples); +} diff --git a/media/ffvpx/libavcodec/audio_frame_queue.h b/media/ffvpx/libavcodec/audio_frame_queue.h new file mode 100644 index 0000000000..d8076eae54 --- /dev/null +++ b/media/ffvpx/libavcodec/audio_frame_queue.h @@ -0,0 +1,83 @@ +/* + * Audio Frame Queue + * Copyright (c) 2012 Justin Ruggles + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AUDIO_FRAME_QUEUE_H +#define AVCODEC_AUDIO_FRAME_QUEUE_H + +#include "avcodec.h" + +typedef struct AudioFrame { + int64_t pts; + int duration; +} AudioFrame; + +typedef struct AudioFrameQueue { + AVCodecContext *avctx; + int remaining_delay; + int remaining_samples; + AudioFrame *frames; + unsigned frame_count; + unsigned frame_alloc; +} AudioFrameQueue; + +/** + * Initialize AudioFrameQueue. + * + * @param avctx context to use for time_base and av_log + * @param afq queue context + */ +void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq); + +/** + * Close AudioFrameQueue. + * + * Frees memory if needed. + * + * @param afq queue context + */ +void ff_af_queue_close(AudioFrameQueue *afq); + +/** + * Add a frame to the queue. + * + * @param afq queue context + * @param f frame to add to the queue + */ +int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f); + +/** + * Remove frame(s) from the queue. + * + * Retrieves the pts of the next available frame, or a generated pts based on + * the last frame duration if there are no frames left in the queue. The number + * of requested samples should be the full number of samples represented by the + * packet that will be output by the encoder. If fewer samples are available + * in the queue, a smaller value will be used for the output duration. + * + * @param afq queue context + * @param nb_samples number of samples to remove from the queue + * @param[out] pts output packet pts + * @param[out] duration output packet duration + */ +void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts, + int64_t *duration); + +#endif /* AVCODEC_AUDIO_FRAME_QUEUE_H */ diff --git a/media/ffvpx/libavcodec/codec_list.c b/media/ffvpx/libavcodec/codec_list.c index 04259e3cd7..7c6b0ceacd 100644 --- a/media/ffvpx/libavcodec/codec_list.c +++ b/media/ffvpx/libavcodec/codec_list.c @@ -20,6 +20,9 @@ static const FFCodec * const codec_list[] = { #if CONFIG_LIBVORBIS_DECODER &ff_libvorbis_decoder, #endif +#if CONFIG_LIBVORBIS_ENCODER + &ff_libvorbis_encoder, +#endif #if CONFIG_PCM_ALAW_DECODER &ff_pcm_alaw_decoder, #endif @@ -44,6 +47,9 @@ static const FFCodec * const codec_list[] = { #if CONFIG_LIBOPUS_DECODER &ff_libopus_decoder, #endif +#if CONFIG_LIBOPUS_ENCODER + &ff_libopus_encoder, +#endif #if CONFIG_LIBVPX_VP8_DECODER &ff_libvpx_vp8_decoder, #endif diff --git a/media/ffvpx/libavcodec/libopusenc.c b/media/ffvpx/libavcodec/libopusenc.c new file mode 100644 index 0000000000..68667e3350 --- /dev/null +++ b/media/ffvpx/libavcodec/libopusenc.c @@ -0,0 +1,610 @@ +/* + * Opus encoder using libopus + * Copyright (c) 2012 Nathan Caldwell + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <opus.h> +#include <opus_multistream.h> + +#include "libavutil/channel_layout.h" +#include "libavutil/opt.h" +#include "avcodec.h" +#include "bytestream.h" +#include "codec_internal.h" +#include "encode.h" +#include "libopus.h" +#include "audio_frame_queue.h" +#include "vorbis_data.h" + +typedef struct LibopusEncOpts { + int vbr; + int application; + int packet_loss; + int fec; + int complexity; + float frame_duration; + int packet_size; + int max_bandwidth; + int mapping_family; + int dtx; +#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST + int apply_phase_inv; +#endif +} LibopusEncOpts; + +typedef struct LibopusEncContext { + AVClass *class; + OpusMSEncoder *enc; + int stream_count; + uint8_t *samples; + LibopusEncOpts opts; + AudioFrameQueue afq; + const uint8_t *encoder_channel_map; +} LibopusEncContext; + +static const uint8_t opus_coupled_streams[8] = { + 0, 1, 1, 2, 2, 2, 2, 3 +}; + +/* Opus internal to Vorbis channel order mapping written in the header */ +static const uint8_t opus_vorbis_channel_map[8][8] = { + { 0 }, + { 0, 1 }, + { 0, 2, 1 }, + { 0, 1, 2, 3 }, + { 0, 4, 1, 2, 3 }, + { 0, 4, 1, 2, 3, 5 }, + { 0, 4, 1, 2, 3, 5, 6 }, + { 0, 6, 1, 2, 3, 4, 5, 7 }, +}; + +/* libavcodec to libopus channel order mapping, passed to libopus */ +static const uint8_t libavcodec_libopus_channel_map[8][8] = { + { 0 }, + { 0, 1 }, + { 0, 1, 2 }, + { 0, 1, 2, 3 }, + { 0, 1, 3, 4, 2 }, + { 0, 1, 4, 5, 2, 3 }, + { 0, 1, 5, 6, 2, 4, 3 }, + { 0, 1, 6, 7, 4, 5, 2, 3 }, +}; + +static void libopus_write_header(AVCodecContext *avctx, int stream_count, + int coupled_stream_count, + int mapping_family, + const uint8_t *channel_mapping) +{ + uint8_t *p = avctx->extradata; + int channels = avctx->ch_layout.nb_channels; + + bytestream_put_buffer(&p, "OpusHead", 8); + bytestream_put_byte(&p, 1); /* Version */ + bytestream_put_byte(&p, channels); + bytestream_put_le16(&p, avctx->initial_padding * 48000 / avctx->sample_rate); /* Lookahead samples at 48kHz */ + bytestream_put_le32(&p, avctx->sample_rate); /* Original sample rate */ + bytestream_put_le16(&p, 0); /* Gain of 0dB is recommended. 
*/ + + /* Channel mapping */ + bytestream_put_byte(&p, mapping_family); + if (mapping_family != 0) { + bytestream_put_byte(&p, stream_count); + bytestream_put_byte(&p, coupled_stream_count); + bytestream_put_buffer(&p, channel_mapping, channels); + } +} + +static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc, + LibopusEncOpts *opts) +{ + int ret; + + if (avctx->global_quality) { + av_log(avctx, AV_LOG_ERROR, + "Quality-based encoding not supported, " + "please specify a bitrate and VBR setting.\n"); + return AVERROR(EINVAL); + } + + ret = opus_multistream_encoder_ctl(enc, OPUS_SET_BITRATE(avctx->bit_rate)); + if (ret != OPUS_OK) { + av_log(avctx, AV_LOG_ERROR, + "Failed to set bitrate: %s\n", opus_strerror(ret)); + return ret; + } + + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_COMPLEXITY(opts->complexity)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set complexity: %s\n", opus_strerror(ret)); + + ret = opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(!!opts->vbr)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set VBR: %s\n", opus_strerror(ret)); + + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_VBR_CONSTRAINT(opts->vbr == 2)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set constrained VBR: %s\n", opus_strerror(ret)); + + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_PACKET_LOSS_PERC(opts->packet_loss)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set expected packet loss percentage: %s\n", + opus_strerror(ret)); + + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_INBAND_FEC(opts->fec)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set inband FEC: %s\n", + opus_strerror(ret)); + + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_DTX(opts->dtx)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set DTX: %s\n", + opus_strerror(ret)); + + if (avctx->cutoff) { + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_MAX_BANDWIDTH(opts->max_bandwidth)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set maximum bandwidth: %s\n", opus_strerror(ret)); + } + +#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_PHASE_INVERSION_DISABLED(!opts->apply_phase_inv)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set phase inversion: %s\n", + opus_strerror(ret)); +#endif + return OPUS_OK; +} + +static int libopus_check_max_channels(AVCodecContext *avctx, + int max_channels) { + if (avctx->ch_layout.nb_channels > max_channels) { + av_log(avctx, AV_LOG_ERROR, "Opus mapping family undefined for %d channels.\n", + avctx->ch_layout.nb_channels); + return AVERROR(EINVAL); + } + + return 0; +} + +static int libopus_check_vorbis_layout(AVCodecContext *avctx, int mapping_family) { + av_assert2(avctx->ch_layout.nb_channels < FF_ARRAY_ELEMS(ff_vorbis_ch_layouts)); + + if (avctx->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) { + av_log(avctx, AV_LOG_WARNING, + "No channel layout specified. 
Opus encoder will use Vorbis " + "channel layout for %d channels.\n", avctx->ch_layout.nb_channels); + } else if (av_channel_layout_compare(&avctx->ch_layout, &ff_vorbis_ch_layouts[avctx->ch_layout.nb_channels - 1])) { + char name[32]; + + av_channel_layout_describe(&avctx->ch_layout, name, sizeof(name)); + av_log(avctx, AV_LOG_ERROR, + "Invalid channel layout %s for specified mapping family %d.\n", + name, mapping_family); + + return AVERROR(EINVAL); + } + + return 0; +} + +static int libopus_validate_layout_and_get_channel_map( + AVCodecContext *avctx, + int mapping_family, + const uint8_t ** channel_map_result) +{ + const uint8_t * channel_map = NULL; + int ret; + + switch (mapping_family) { + case -1: + ret = libopus_check_max_channels(avctx, 8); + if (ret == 0) { + ret = libopus_check_vorbis_layout(avctx, mapping_family); + /* Channels do not need to be reordered. */ + } + + break; + case 0: + ret = libopus_check_max_channels(avctx, 2); + if (ret == 0) { + ret = libopus_check_vorbis_layout(avctx, mapping_family); + } + break; + case 1: + /* Opus expects channels to be in Vorbis order. */ + ret = libopus_check_max_channels(avctx, 8); + if (ret == 0) { + ret = libopus_check_vorbis_layout(avctx, mapping_family); + channel_map = ff_vorbis_channel_layout_offsets[avctx->ch_layout.nb_channels - 1]; + } + break; + case 255: + ret = libopus_check_max_channels(avctx, 254); + break; + default: + av_log(avctx, AV_LOG_WARNING, + "Unknown channel mapping family %d. Output channel layout may be invalid.\n", + mapping_family); + ret = 0; + } + + *channel_map_result = channel_map; + return ret; +} + +static av_cold int libopus_encode_init(AVCodecContext *avctx) +{ + LibopusEncContext *opus = avctx->priv_data; + OpusMSEncoder *enc; + uint8_t libopus_channel_mapping[255]; + int ret = OPUS_OK; + int channels = avctx->ch_layout.nb_channels; + int av_ret; + int coupled_stream_count, header_size, frame_size; + int mapping_family; + + frame_size = opus->opts.frame_duration * 48000 / 1000; + switch (frame_size) { + case 120: + case 240: + if (opus->opts.application != OPUS_APPLICATION_RESTRICTED_LOWDELAY) + av_log(avctx, AV_LOG_WARNING, + "LPC mode cannot be used with a frame duration of less " + "than 10ms. Enabling restricted low-delay mode.\n" + "Use a longer frame duration if this is not what you want.\n"); + /* Frame sizes less than 10 ms can only use MDCT mode, so switching to + * RESTRICTED_LOWDELAY avoids an unnecessary extra 2.5ms lookahead. */ + opus->opts.application = OPUS_APPLICATION_RESTRICTED_LOWDELAY; + case 480: + case 960: + case 1920: + case 2880: +#ifdef OPUS_FRAMESIZE_120_MS + case 3840: + case 4800: + case 5760: +#endif + opus->opts.packet_size = + avctx->frame_size = frame_size * avctx->sample_rate / 48000; + break; + default: + av_log(avctx, AV_LOG_ERROR, "Invalid frame duration: %g.\n" + "Frame duration must be exactly one of: 2.5, 5, 10, 20, 40" +#ifdef OPUS_FRAMESIZE_120_MS + ", 60, 80, 100 or 120.\n", +#else + " or 60.\n", +#endif + opus->opts.frame_duration); + return AVERROR(EINVAL); + } + + if (avctx->compression_level < 0 || avctx->compression_level > 10) { + av_log(avctx, AV_LOG_WARNING, + "Compression level must be in the range 0 to 10. 
" + "Defaulting to 10.\n"); + opus->opts.complexity = 10; + } else { + opus->opts.complexity = avctx->compression_level; + } + + if (avctx->cutoff) { + switch (avctx->cutoff) { + case 4000: + opus->opts.max_bandwidth = OPUS_BANDWIDTH_NARROWBAND; + break; + case 6000: + opus->opts.max_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND; + break; + case 8000: + opus->opts.max_bandwidth = OPUS_BANDWIDTH_WIDEBAND; + break; + case 12000: + opus->opts.max_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND; + break; + case 20000: + opus->opts.max_bandwidth = OPUS_BANDWIDTH_FULLBAND; + break; + default: + av_log(avctx, AV_LOG_WARNING, + "Invalid frequency cutoff: %d. Using default maximum bandwidth.\n" + "Cutoff frequency must be exactly one of: 4000, 6000, 8000, 12000 or 20000.\n", + avctx->cutoff); + avctx->cutoff = 0; + } + } + + /* Channels may need to be reordered to match opus mapping. */ + av_ret = libopus_validate_layout_and_get_channel_map(avctx, opus->opts.mapping_family, + &opus->encoder_channel_map); + if (av_ret) { + return av_ret; + } + + if (opus->opts.mapping_family == -1) { + /* By default, use mapping family 1 for the header but use the older + * libopus multistream API to avoid surround masking. */ + + /* Set the mapping family so that the value is correct in the header */ + mapping_family = channels > 2 ? 1 : 0; + coupled_stream_count = opus_coupled_streams[channels - 1]; + opus->stream_count = channels - coupled_stream_count; + memcpy(libopus_channel_mapping, + opus_vorbis_channel_map[channels - 1], + channels * sizeof(*libopus_channel_mapping)); + + enc = opus_multistream_encoder_create( + avctx->sample_rate, channels, opus->stream_count, + coupled_stream_count, + libavcodec_libopus_channel_map[channels - 1], + opus->opts.application, &ret); + } else { + /* Use the newer multistream API. The encoder will set the channel + * mapping and coupled stream counts to its internal defaults and will + * use surround masking analysis to save bits. */ + mapping_family = opus->opts.mapping_family; + enc = opus_multistream_surround_encoder_create( + avctx->sample_rate, channels, mapping_family, + &opus->stream_count, &coupled_stream_count, libopus_channel_mapping, + opus->opts.application, &ret); + } + + if (ret != OPUS_OK) { + av_log(avctx, AV_LOG_ERROR, + "Failed to create encoder: %s\n", opus_strerror(ret)); + return ff_opus_error_to_averror(ret); + } + + if (!avctx->bit_rate) { + /* Sane default copied from opusenc */ + avctx->bit_rate = 64000 * opus->stream_count + + 32000 * coupled_stream_count; + av_log(avctx, AV_LOG_WARNING, + "No bit rate set. Defaulting to %"PRId64" bps.\n", avctx->bit_rate); + } + + if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * channels) { + av_log(avctx, AV_LOG_ERROR, "The bit rate %"PRId64" bps is unsupported. " + "Please choose a value between 500 and %d.\n", avctx->bit_rate, + 256000 * channels); + ret = AVERROR(EINVAL); + goto fail; + } + + ret = libopus_configure_encoder(avctx, enc, &opus->opts); + if (ret != OPUS_OK) { + ret = ff_opus_error_to_averror(ret); + goto fail; + } + + /* Header includes channel mapping table if and only if mapping family is NOT 0 */ + header_size = 19 + (mapping_family == 0 ? 
0 : 2 + channels); + avctx->extradata = av_malloc(header_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!avctx->extradata) { + av_log(avctx, AV_LOG_ERROR, "Failed to allocate extradata.\n"); + ret = AVERROR(ENOMEM); + goto fail; + } + avctx->extradata_size = header_size; + + opus->samples = av_calloc(frame_size, channels * + av_get_bytes_per_sample(avctx->sample_fmt)); + if (!opus->samples) { + av_log(avctx, AV_LOG_ERROR, "Failed to allocate samples buffer.\n"); + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = opus_multistream_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&avctx->initial_padding)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to get number of lookahead samples: %s\n", + opus_strerror(ret)); + + libopus_write_header(avctx, opus->stream_count, coupled_stream_count, + mapping_family, libopus_channel_mapping); + + ff_af_queue_init(avctx, &opus->afq); + + opus->enc = enc; + + return 0; + +fail: + opus_multistream_encoder_destroy(enc); + return ret; +} + +static void libopus_copy_samples_with_channel_map( + uint8_t *dst, const uint8_t *src, const uint8_t *channel_map, + int nb_channels, int nb_samples, int bytes_per_sample) { + int sample, channel; + for (sample = 0; sample < nb_samples; ++sample) { + for (channel = 0; channel < nb_channels; ++channel) { + const size_t src_pos = bytes_per_sample * (nb_channels * sample + channel); + const size_t dst_pos = bytes_per_sample * (nb_channels * sample + channel_map[channel]); + + memcpy(&dst[dst_pos], &src[src_pos], bytes_per_sample); + } + } +} + +static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt, + const AVFrame *frame, int *got_packet_ptr) +{ + LibopusEncContext *opus = avctx->priv_data; + const int bytes_per_sample = av_get_bytes_per_sample(avctx->sample_fmt); + const int channels = avctx->ch_layout.nb_channels; + const int sample_size = channels * bytes_per_sample; + const uint8_t *audio; + int ret; + int discard_padding; + + if (frame) { + ret = ff_af_queue_add(&opus->afq, frame); + if (ret < 0) + return ret; + if (opus->encoder_channel_map != NULL) { + audio = opus->samples; + libopus_copy_samples_with_channel_map( + opus->samples, frame->data[0], opus->encoder_channel_map, + channels, frame->nb_samples, bytes_per_sample); + } else if (frame->nb_samples < opus->opts.packet_size) { + audio = opus->samples; + memcpy(opus->samples, frame->data[0], frame->nb_samples * sample_size); + } else + audio = frame->data[0]; + } else { + if (!opus->afq.remaining_samples || (!opus->afq.frame_alloc && !opus->afq.frame_count)) + return 0; + audio = opus->samples; + memset(opus->samples, 0, opus->opts.packet_size * sample_size); + } + + /* Maximum packet size taken from opusenc in opus-tools. 120ms packets + * consist of 6 frames in one packet. The maximum frame size is 1275 + * bytes along with the largest possible packet header of 7 bytes. 
*/ + if ((ret = ff_alloc_packet(avctx, avpkt, (1275 * 6 + 7) * opus->stream_count)) < 0) + return ret; + + if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT) + ret = opus_multistream_encode_float(opus->enc, (const float *)audio, + opus->opts.packet_size, + avpkt->data, avpkt->size); + else + ret = opus_multistream_encode(opus->enc, (const opus_int16 *)audio, + opus->opts.packet_size, + avpkt->data, avpkt->size); + + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, + "Error encoding frame: %s\n", opus_strerror(ret)); + return ff_opus_error_to_averror(ret); + } + + av_shrink_packet(avpkt, ret); + + ff_af_queue_remove(&opus->afq, opus->opts.packet_size, + &avpkt->pts, &avpkt->duration); + + discard_padding = opus->opts.packet_size - avpkt->duration; + // Check if subtraction resulted in an overflow + if ((discard_padding < opus->opts.packet_size) != (avpkt->duration > 0)) + return AVERROR(EINVAL); + if (discard_padding > 0) { + uint8_t* side_data = av_packet_new_side_data(avpkt, + AV_PKT_DATA_SKIP_SAMPLES, + 10); + if (!side_data) + return AVERROR(ENOMEM); + AV_WL32(side_data + 4, discard_padding); + } + + *got_packet_ptr = 1; + + return 0; +} + +static av_cold int libopus_encode_close(AVCodecContext *avctx) +{ + LibopusEncContext *opus = avctx->priv_data; + + opus_multistream_encoder_destroy(opus->enc); + + ff_af_queue_close(&opus->afq); + + av_freep(&opus->samples); + + return 0; +} + +#define OFFSET(x) offsetof(LibopusEncContext, opts.x) +#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM +static const AVOption libopus_options[] = { + { "application", "Intended application type", OFFSET(application), AV_OPT_TYPE_INT, { .i64 = OPUS_APPLICATION_AUDIO }, OPUS_APPLICATION_VOIP, OPUS_APPLICATION_RESTRICTED_LOWDELAY, FLAGS, "application" }, + { "voip", "Favor improved speech intelligibility", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_VOIP }, 0, 0, FLAGS, "application" }, + { "audio", "Favor faithfulness to the input", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_AUDIO }, 0, 0, FLAGS, "application" }, + { "lowdelay", "Restrict to only the lowest delay modes", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_RESTRICTED_LOWDELAY }, 0, 0, FLAGS, "application" }, + { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS }, + { "packet_loss", "Expected packet loss percentage", OFFSET(packet_loss), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, FLAGS }, + { "fec", "Enable inband FEC. 
Expected packet loss must be non-zero", OFFSET(fec), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS }, + { "vbr", "Variable bit rate mode", OFFSET(vbr), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 2, FLAGS, "vbr" }, + { "off", "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" }, + { "on", "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" }, + { "constrained", "Use constrained VBR", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" }, + { "mapping_family", "Channel Mapping Family", OFFSET(mapping_family), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 255, FLAGS, "mapping_family" }, + { "dtx", "Enable DTX", OFFSET(dtx), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS }, +#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST + { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, +#endif + { NULL }, +}; + +static const AVClass libopus_class = { + .class_name = "libopus", + .item_name = av_default_item_name, + .option = libopus_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const FFCodecDefault libopus_defaults[] = { + { "b", "0" }, + { "compression_level", "10" }, + { NULL }, +}; + +static const int libopus_sample_rates[] = { + 48000, 24000, 16000, 12000, 8000, 0, +}; + +const FFCodec ff_libopus_encoder = { + .p.name = "libopus", + CODEC_LONG_NAME("libopus Opus"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_OPUS, + .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | + AV_CODEC_CAP_SMALL_LAST_FRAME, + .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE, + .priv_data_size = sizeof(LibopusEncContext), + .init = libopus_encode_init, + FF_CODEC_ENCODE_CB(libopus_encode), + .close = libopus_encode_close, + .p.sample_fmts = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_FLT, + AV_SAMPLE_FMT_NONE }, + .p.supported_samplerates = libopus_sample_rates, + .p.priv_class = &libopus_class, + .defaults = libopus_defaults, + .p.wrapper_name = "libopus", +}; diff --git a/media/ffvpx/libavcodec/libvorbisenc.c b/media/ffvpx/libavcodec/libvorbisenc.c new file mode 100644 index 0000000000..6331cf0d79 --- /dev/null +++ b/media/ffvpx/libavcodec/libvorbisenc.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <vorbis/vorbisenc.h> + +#include "libavutil/avassert.h" +#include "libavutil/channel_layout.h" +#include "libavutil/fifo.h" +#include "libavutil/opt.h" +#include "avcodec.h" +#include "audio_frame_queue.h" +#include "codec_internal.h" +#include "encode.h" +#include "version.h" +#include "vorbis_parser.h" + + +/* Number of samples the user should send in each call. 
+ * This value is used because it is the LCD of all possible frame sizes, so + * an output packet will always start at the same point as one of the input + * packets. + */ +#define LIBVORBIS_FRAME_SIZE 64 + +#define BUFFER_SIZE (1024 * 64) + +typedef struct LibvorbisEncContext { + AVClass *av_class; /**< class for AVOptions */ + vorbis_info vi; /**< vorbis_info used during init */ + vorbis_dsp_state vd; /**< DSP state used for analysis */ + vorbis_block vb; /**< vorbis_block used for analysis */ + AVFifo *pkt_fifo; /**< output packet buffer */ + int eof; /**< end-of-file flag */ + int dsp_initialized; /**< vd has been initialized */ + vorbis_comment vc; /**< VorbisComment info */ + double iblock; /**< impulse block bias option */ + AVVorbisParseContext *vp; /**< parse context to get durations */ + AudioFrameQueue afq; /**< frame queue for timestamps */ +} LibvorbisEncContext; + +static const AVOption options[] = { + { "iblock", "Sets the impulse block bias", offsetof(LibvorbisEncContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM }, + { NULL } +}; + +static const FFCodecDefault defaults[] = { + { "b", "0" }, + { NULL }, +}; + +static const AVClass vorbis_class = { + .class_name = "libvorbis", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const uint8_t vorbis_encoding_channel_layout_offsets[8][8] = { + { 0 }, + { 0, 1 }, + { 0, 2, 1 }, + { 0, 1, 2, 3 }, + { 0, 2, 1, 3, 4 }, + { 0, 2, 1, 4, 5, 3 }, + { 0, 2, 1, 5, 6, 4, 3 }, + { 0, 2, 1, 6, 7, 4, 5, 3 }, +}; + +static int vorbis_error_to_averror(int ov_err) +{ + switch (ov_err) { + case OV_EFAULT: return AVERROR_BUG; + case OV_EINVAL: return AVERROR(EINVAL); + case OV_EIMPL: return AVERROR(EINVAL); + default: return AVERROR_UNKNOWN; + } +} + +static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx) +{ + LibvorbisEncContext *s = avctx->priv_data; + int channels = avctx->ch_layout.nb_channels; + double cfreq; + int ret; + + if (avctx->flags & AV_CODEC_FLAG_QSCALE || !avctx->bit_rate) { + /* variable bitrate + * NOTE: we use the oggenc range of -1 to 10 for global_quality for + * user convenience, but libvorbis uses -0.1 to 1.0. + */ + float q = avctx->global_quality / (float)FF_QP2LAMBDA; + /* default to 3 if the user did not set quality or bitrate */ + if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) + q = 3.0; + if ((ret = vorbis_encode_setup_vbr(vi, channels, + avctx->sample_rate, + q / 10.0))) + goto error; + } else { + int minrate = avctx->rc_min_rate > 0 ? avctx->rc_min_rate : -1; + int maxrate = avctx->rc_max_rate > 0 ? 
avctx->rc_max_rate : -1; + + /* average bitrate */ + if ((ret = vorbis_encode_setup_managed(vi, channels, + avctx->sample_rate, maxrate, + avctx->bit_rate, minrate))) + goto error; + + /* variable bitrate by estimate, disable slow rate management */ + if (minrate == -1 && maxrate == -1) + if ((ret = vorbis_encode_ctl(vi, OV_ECTL_RATEMANAGE2_SET, NULL))) + goto error; /* should not happen */ + } + + /* cutoff frequency */ + if (avctx->cutoff > 0) { + cfreq = avctx->cutoff / 1000.0; + if ((ret = vorbis_encode_ctl(vi, OV_ECTL_LOWPASS_SET, &cfreq))) + goto error; /* should not happen */ + } + + /* impulse block bias */ + if (s->iblock) { + if ((ret = vorbis_encode_ctl(vi, OV_ECTL_IBLOCK_SET, &s->iblock))) + goto error; + } + + if ((channels == 3 && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_SURROUND)) || + (channels == 4 && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_2_2) && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_QUAD)) || + (channels == 5 && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT0) && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT0_BACK)) || + (channels == 6 && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT1) && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT1_BACK)) || + (channels == 7 && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_6POINT1)) || + (channels == 8 && + av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_7POINT1))) { + if (avctx->ch_layout.order != AV_CHANNEL_ORDER_UNSPEC) { + char name[32]; + av_channel_layout_describe(&avctx->ch_layout, name, sizeof(name)); + av_log(avctx, AV_LOG_ERROR, "%s not supported by Vorbis: " + "output stream will have incorrect " + "channel layout.\n", name); + } else { + av_log(avctx, AV_LOG_WARNING, "No channel layout specified. 
The encoder " + "will use Vorbis channel layout for " + "%d channels.\n", channels); + } + } + + if ((ret = vorbis_encode_setup_init(vi))) + goto error; + + return 0; +error: + return vorbis_error_to_averror(ret); +} + +/* How many bytes are needed for a buffer of length 'l' */ +static int xiph_len(int l) +{ + return 1 + l / 255 + l; +} + +static av_cold int libvorbis_encode_close(AVCodecContext *avctx) +{ + LibvorbisEncContext *s = avctx->priv_data; + + /* notify vorbisenc this is EOF */ + if (s->dsp_initialized) + vorbis_analysis_wrote(&s->vd, 0); + + vorbis_block_clear(&s->vb); + vorbis_dsp_clear(&s->vd); + vorbis_info_clear(&s->vi); + + av_fifo_freep2(&s->pkt_fifo); + ff_af_queue_close(&s->afq); + + av_vorbis_parse_free(&s->vp); + + return 0; +} + +static av_cold int libvorbis_encode_init(AVCodecContext *avctx) +{ + LibvorbisEncContext *s = avctx->priv_data; + ogg_packet header, header_comm, header_code; + uint8_t *p; + unsigned int offset; + int ret; + + vorbis_info_init(&s->vi); + if ((ret = libvorbis_setup(&s->vi, avctx))) { + av_log(avctx, AV_LOG_ERROR, "encoder setup failed\n"); + goto error; + } + if ((ret = vorbis_analysis_init(&s->vd, &s->vi))) { + av_log(avctx, AV_LOG_ERROR, "analysis init failed\n"); + ret = vorbis_error_to_averror(ret); + goto error; + } + s->dsp_initialized = 1; + if ((ret = vorbis_block_init(&s->vd, &s->vb))) { + av_log(avctx, AV_LOG_ERROR, "dsp init failed\n"); + ret = vorbis_error_to_averror(ret); + goto error; + } + + vorbis_comment_init(&s->vc); + if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) + vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT); + + if ((ret = vorbis_analysis_headerout(&s->vd, &s->vc, &header, &header_comm, + &header_code))) { + ret = vorbis_error_to_averror(ret); + goto error; + } + + avctx->extradata_size = 1 + xiph_len(header.bytes) + + xiph_len(header_comm.bytes) + + header_code.bytes; + p = avctx->extradata = av_malloc(avctx->extradata_size + + AV_INPUT_BUFFER_PADDING_SIZE); + if (!p) { + ret = AVERROR(ENOMEM); + goto error; + } + p[0] = 2; + offset = 1; + offset += av_xiphlacing(&p[offset], header.bytes); + offset += av_xiphlacing(&p[offset], header_comm.bytes); + memcpy(&p[offset], header.packet, header.bytes); + offset += header.bytes; + memcpy(&p[offset], header_comm.packet, header_comm.bytes); + offset += header_comm.bytes; + memcpy(&p[offset], header_code.packet, header_code.bytes); + offset += header_code.bytes; + av_assert0(offset == avctx->extradata_size); + + s->vp = av_vorbis_parse_init(avctx->extradata, avctx->extradata_size); + if (!s->vp) { + av_log(avctx, AV_LOG_ERROR, "invalid extradata\n"); + return ret; + } + + vorbis_comment_clear(&s->vc); + + avctx->frame_size = LIBVORBIS_FRAME_SIZE; + ff_af_queue_init(avctx, &s->afq); + + s->pkt_fifo = av_fifo_alloc2(BUFFER_SIZE, 1, 0); + if (!s->pkt_fifo) { + ret = AVERROR(ENOMEM); + goto error; + } + + return 0; +error: + libvorbis_encode_close(avctx); + return ret; +} + +static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, + const AVFrame *frame, int *got_packet_ptr) +{ + LibvorbisEncContext *s = avctx->priv_data; + ogg_packet op; + int ret, duration; + + /* send samples to libvorbis */ + if (frame) { + const int samples = frame->nb_samples; + float **buffer; + int c, channels = s->vi.channels; + + buffer = vorbis_analysis_buffer(&s->vd, samples); + for (c = 0; c < channels; c++) { + int co = (channels > 8) ? 
c : + vorbis_encoding_channel_layout_offsets[channels - 1][c]; + memcpy(buffer[c], frame->extended_data[co], + samples * sizeof(*buffer[c])); + } + if ((ret = vorbis_analysis_wrote(&s->vd, samples)) < 0) { + av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n"); + return vorbis_error_to_averror(ret); + } + if ((ret = ff_af_queue_add(&s->afq, frame)) < 0) + return ret; + } else { + if (!s->eof && s->afq.frame_alloc) + if ((ret = vorbis_analysis_wrote(&s->vd, 0)) < 0) { + av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n"); + return vorbis_error_to_averror(ret); + } + s->eof = 1; + } + + /* retrieve available packets from libvorbis */ + while ((ret = vorbis_analysis_blockout(&s->vd, &s->vb)) == 1) { + if ((ret = vorbis_analysis(&s->vb, NULL)) < 0) + break; + if ((ret = vorbis_bitrate_addblock(&s->vb)) < 0) + break; + + /* add any available packets to the output packet buffer */ + while ((ret = vorbis_bitrate_flushpacket(&s->vd, &op)) == 1) { + if (av_fifo_can_write(s->pkt_fifo) < sizeof(ogg_packet) + op.bytes) { + av_log(avctx, AV_LOG_ERROR, "packet buffer is too small\n"); + return AVERROR_BUG; + } + av_fifo_write(s->pkt_fifo, &op, sizeof(ogg_packet)); + av_fifo_write(s->pkt_fifo, op.packet, op.bytes); + } + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "error getting available packets\n"); + break; + } + } + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "error getting available packets\n"); + return vorbis_error_to_averror(ret); + } + + /* Read an available packet if possible */ + if (av_fifo_read(s->pkt_fifo, &op, sizeof(ogg_packet)) < 0) + return 0; + + if ((ret = ff_get_encode_buffer(avctx, avpkt, op.bytes, 0)) < 0) + return ret; + av_fifo_read(s->pkt_fifo, avpkt->data, op.bytes); + + avpkt->pts = ff_samples_to_time_base(avctx, op.granulepos); + + duration = av_vorbis_parse_frame(s->vp, avpkt->data, avpkt->size); + if (duration > 0) { + /* we do not know encoder delay until we get the first packet from + * libvorbis, so we have to update the AudioFrameQueue counts */ + if (!avctx->initial_padding && s->afq.frames) { + avctx->initial_padding = duration; + av_assert0(!s->afq.remaining_delay); + s->afq.frames->duration += duration; + if (s->afq.frames->pts != AV_NOPTS_VALUE) + s->afq.frames->pts -= duration; + s->afq.remaining_samples += duration; + } + ff_af_queue_remove(&s->afq, duration, &avpkt->pts, &avpkt->duration); + } + + *got_packet_ptr = 1; + return 0; +} + +const FFCodec ff_libvorbis_encoder = { + .p.name = "libvorbis", + CODEC_LONG_NAME("libvorbis"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_VORBIS, + .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | + AV_CODEC_CAP_SMALL_LAST_FRAME, + .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE, + .priv_data_size = sizeof(LibvorbisEncContext), + .init = libvorbis_encode_init, + FF_CODEC_ENCODE_CB(libvorbis_encode_frame), + .close = libvorbis_encode_close, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP, + AV_SAMPLE_FMT_NONE }, + .p.priv_class = &vorbis_class, + .defaults = defaults, + .p.wrapper_name = "libvorbis", +}; diff --git a/media/ffvpx/libavcodec/moz.build b/media/ffvpx/libavcodec/moz.build index 0ba603d172..886fa7a2cb 100644 --- a/media/ffvpx/libavcodec/moz.build +++ b/media/ffvpx/libavcodec/moz.build @@ -20,6 +20,7 @@ LOCAL_INCLUDES += ['/modules/fdlibm/inexact-math-override'] SharedLibrary('mozavcodec') SOURCES += [ 'allcodecs.c', + 'audio_frame_queue.c', 'avcodec.c', 'avdct.c', 'avfft.c', @@ -47,7 +48,9 @@ SOURCES += [ 'jrevdct.c', 'libopus.c', 'libopusdec.c', + 
'libopusenc.c', 'libvorbisdec.c', + 'libvorbisenc.c', 'log2_tab.c', 'mpegaudio.c', 'mpegaudiodata.c', diff --git a/media/ffvpx/libavutil/avutil.symbols b/media/ffvpx/libavutil/avutil.symbols index 0ad6fad9cd..5ee7afb855 100644 --- a/media/ffvpx/libavutil/avutil.symbols +++ b/media/ffvpx/libavutil/avutil.symbols @@ -92,6 +92,7 @@ av_fifo_alloc av_fifo_alloc2 av_fifo_alloc_array av_fifo_can_read +av_fifo_can_write av_fifo_drain av_fifo_drain2 av_fifo_free diff --git a/media/ffvpx/opusenc-dtx.patch b/media/ffvpx/opusenc-dtx.patch new file mode 100644 index 0000000000..bf9fc9de87 --- /dev/null +++ b/media/ffvpx/opusenc-dtx.patch @@ -0,0 +1,63 @@ +diff --git a/media/ffvpx/libavcodec/libopusenc.c b/media/ffvpx/libavcodec/libopusenc.c +--- a/media/ffvpx/libavcodec/libopusenc.c ++++ b/media/ffvpx/libavcodec/libopusenc.c +@@ -37,16 +37,17 @@ typedef struct LibopusEncOpts { + int application; + int packet_loss; + int fec; + int complexity; + float frame_duration; + int packet_size; + int max_bandwidth; + int mapping_family; ++ int dtx; + #ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST + int apply_phase_inv; + #endif + } LibopusEncOpts; + + typedef struct LibopusEncContext { + AVClass *class; + OpusMSEncoder *enc; +@@ -154,16 +155,23 @@ static int libopus_configure_encoder(AVC + + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_INBAND_FEC(opts->fec)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set inband FEC: %s\n", + opus_strerror(ret)); + ++ ret = opus_multistream_encoder_ctl(enc, ++ OPUS_SET_DTX(opts->dtx)); ++ if (ret != OPUS_OK) ++ av_log(avctx, AV_LOG_WARNING, ++ "Unable to set DTX: %s\n", ++ opus_strerror(ret)); ++ + if (avctx->cutoff) { + ret = opus_multistream_encoder_ctl(enc, + OPUS_SET_MAX_BANDWIDTH(opts->max_bandwidth)); + if (ret != OPUS_OK) + av_log(avctx, AV_LOG_WARNING, + "Unable to set maximum bandwidth: %s\n", opus_strerror(ret)); + } + +@@ -551,16 +559,17 @@ static const AVOption libopus_options[] + { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS }, + { "packet_loss", "Expected packet loss percentage", OFFSET(packet_loss), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, FLAGS }, + { "fec", "Enable inband FEC. 
Expected packet loss must be non-zero", OFFSET(fec), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS }, + { "vbr", "Variable bit rate mode", OFFSET(vbr), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 2, FLAGS, "vbr" }, + { "off", "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" }, + { "on", "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" }, + { "constrained", "Use constrained VBR", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" }, + { "mapping_family", "Channel Mapping Family", OFFSET(mapping_family), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 255, FLAGS, "mapping_family" }, ++ { "dtx", "Enable DTX", OFFSET(dtx), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS }, + #ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST + { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, + #endif + { NULL }, + }; + + static const AVClass libopus_class = { + .class_name = "libopus", diff --git a/media/libaom/0001-errno.patch b/media/libaom/0001-errno.patch new file mode 100644 index 0000000000..6040c42e38 --- /dev/null +++ b/media/libaom/0001-errno.patch @@ -0,0 +1,22 @@ +diff --git a/aom_util/aom_pthread.h b/aom/aom_util/aom_pthread.h +--- a/aom_util/aom_pthread.h ++++ b/aom_util/aom_pthread.h +@@ -30,16 +30,18 @@ extern "C" { + #define WIN32_LEAN_AND_MEAN + #include <process.h> // NOLINT + #include <stddef.h> // NOLINT + #include <windows.h> // NOLINT + typedef HANDLE pthread_t; + typedef int pthread_attr_t; + typedef CRITICAL_SECTION pthread_mutex_t; + ++#include <errno.h> ++ + #if _WIN32_WINNT < 0x0600 + #error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. + #endif + typedef CONDITION_VARIABLE pthread_cond_t; + + #ifndef WINAPI_FAMILY_PARTITION + #define WINAPI_PARTITION_DESKTOP 1 + #define WINAPI_FAMILY_PARTITION(x) x diff --git a/media/libaom/0002-mmloadusi64.patch b/media/libaom/0002-mmloadusi64.patch new file mode 100644 index 0000000000..9d23c90f22 --- /dev/null +++ b/media/libaom/0002-mmloadusi64.patch @@ -0,0 +1,79 @@ +diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h +--- a/aom_dsp/x86/synonyms.h ++++ b/aom_dsp/x86/synonyms.h +@@ -41,22 +41,35 @@ static INLINE __m128i xx_loadl_64(const + static INLINE __m128i xx_load_128(const void *a) { + return _mm_load_si128((const __m128i *)a); + } + + static INLINE __m128i xx_loadu_128(const void *a) { + return _mm_loadu_si128((const __m128i *)a); + } + ++ ++// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function ++// manually on older compilers. 
++#if !defined(__clang__) && __GNUC_MAJOR__ < 9 ++static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { ++ __m64 hi_, lo_; ++ memcpy(&hi_, hi, sizeof(hi_)); ++ memcpy(&lo_, lo, sizeof(lo_)); ++ return _mm_set_epi64(hi_, lo_); ++} ++#endif ++#else + // Load 64 bits from each of hi and low, and pack into an SSE register + // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate + // the strict aliasing rule, this takes a different approach + static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi)); + } ++#endif + + static INLINE void xx_storel_32(void *const a, const __m128i v) { + const int val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); + } + + static INLINE void xx_storel_64(void *const a, const __m128i v) { + _mm_storel_epi64((__m128i *)a, v); +diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h +--- a/aom_dsp/x86/synonyms_avx2.h ++++ b/aom_dsp/x86/synonyms_avx2.h +@@ -66,21 +66,36 @@ static INLINE __m256i yy_set1_64_from_32 + + // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We + // therefore define an equivalent function using a different intrinsic. + // ([ hi ], [ lo ]) -> [ hi ][ lo ] + static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + } + ++#define GCC_VERSION (__GNUC__ * 10000 \ ++ + __GNUC_MINOR__ * 100 \ ++ + __GNUC_PATCHLEVEL__) ++ ++// _mm256_loadu2_m128i has been introduced in GCC 10.1 ++#if !defined(__clang__) && GCC_VERSION < 101000 ++static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { ++ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); ++ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); ++ return _mm256_set_m128i(mhi, mlo); ++} ++#else + static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); + return yy_set_m128i(mhi, mlo); + } ++#endif ++ ++#undef GCC_VERSION + + static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { + _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); + _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a)); + } + + static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); diff --git a/media/libaom/config/generic/config/aom_config.asm b/media/libaom/config/generic/config/aom_config.asm index be0715562c..0f329a7df5 100644 --- a/media/libaom/config/generic/config/aom_config.asm +++ b/media/libaom/config/generic/config/aom_config.asm @@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1 CONFIG_OUTPUT_FRAME_SIZE equ 0 CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 +CONFIG_QUANT_MATRIX equ 1 CONFIG_RATECTRL_LOG equ 0 CONFIG_RD_COMMAND equ 0 CONFIG_RD_DEBUG equ 0 @@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 0 HAVE_SSE4_2 equ 0 HAVE_SSSE3 equ 0 HAVE_SVE equ 0 +HAVE_SVE2 equ 0 HAVE_VSX equ 0 HAVE_WXWIDGETS equ 0 STATIC_LINK_JXL equ 0 diff --git a/media/libaom/config/generic/config/aom_config.h b/media/libaom/config/generic/config/aom_config.h index a695b0b3e6..c89e1d755c 100644 --- a/media/libaom/config/generic/config/aom_config.h +++ b/media/libaom/config/generic/config/aom_config.h @@ -55,6 +55,7 @@ #define CONFIG_OUTPUT_FRAME_SIZE 0 #define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 +#define CONFIG_QUANT_MATRIX 1 #define CONFIG_RATECTRL_LOG 0 #define 
CONFIG_RD_COMMAND 0 #define CONFIG_RD_DEBUG 0 @@ -89,6 +90,7 @@ #define HAVE_SSE4_2 0 #define HAVE_SSSE3 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_VSX 0 #define HAVE_WXWIDGETS 0 #define INLINE inline diff --git a/media/libaom/config/generic/config/aom_dsp_rtcd.h b/media/libaom/config/generic/config/aom_dsp_rtcd.h index 0418b3568e..a61dc47a47 100644 --- a/media/libaom/config/generic/config/aom_dsp_rtcd.h +++ b/media/libaom/config/generic/config/aom_dsp_rtcd.h @@ -46,9 +46,15 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); #define aom_comp_mask_pred aom_comp_mask_pred_c +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +#define aom_compute_correlation aom_compute_correlation_c + void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); #define aom_compute_flow_at_point aom_compute_flow_at_point_c +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +#define aom_compute_mean_stddev aom_compute_mean_stddev_c + void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define aom_convolve8 aom_convolve8_c @@ -4693,9 +4699,6 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl); #define aom_vector_var aom_vector_var_c -double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2); -#define av1_compute_cross_correlation av1_compute_cross_correlation_c - void aom_dsp_rtcd(void); #include "config/aom_config.h" diff --git a/media/libaom/config/generic/config/aom_scale_rtcd.h b/media/libaom/config/generic/config/aom_scale_rtcd.h index 733b2d9ea1..dd09c4e3a6 100644 --- a/media/libaom/config/generic/config/aom_scale_rtcd.h +++ b/media/libaom/config/generic/config/aom_scale_rtcd.h @@ -8,13 +8,15 @@ #define RTCD_EXTERN extern #endif +#include <stdbool.h> + struct yv12_buffer_config; #ifdef __cplusplus extern "C" { #endif -void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes); +void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes); #define aom_extend_frame_borders aom_extend_frame_borders_c void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end); @@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes); #define aom_yv12_copy_frame aom_yv12_copy_frame_c -void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc); +void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop); #define aom_yv12_copy_u aom_yv12_copy_u_c -void aom_yv12_copy_v_c(const struct 
yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc); +void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop); #define aom_yv12_copy_v aom_yv12_copy_v_c -void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop); #define aom_yv12_copy_y aom_yv12_copy_y_c void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes); @@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c -int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes); +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes); #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c void aom_scale_rtcd(void); diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm index 63034fd7e2..1ec673f263 100644 --- a/media/libaom/config/linux/arm/config/aom_config.asm +++ b/media/libaom/config/linux/arm/config/aom_config.asm @@ -53,6 +53,7 @@ .equ CONFIG_OUTPUT_FRAME_SIZE, 0 .equ CONFIG_PARTITION_SEARCH_ORDER, 0 .equ CONFIG_PIC, 1 +.equ CONFIG_QUANT_MATRIX, 1 .equ CONFIG_RATECTRL_LOG, 0 .equ CONFIG_RD_COMMAND, 0 .equ CONFIG_RD_DEBUG, 0 @@ -87,6 +88,7 @@ .equ HAVE_SSE4_2, 0 .equ HAVE_SSSE3, 0 .equ HAVE_SVE, 0 +.equ HAVE_SVE2, 0 .equ HAVE_VSX, 0 .equ HAVE_WXWIDGETS, 0 .equ STATIC_LINK_JXL, 0 diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h index 3cbe7bf169..fb73e8431e 100644 --- a/media/libaom/config/linux/arm/config/aom_config.h +++ b/media/libaom/config/linux/arm/config/aom_config.h @@ -55,6 +55,7 @@ #define CONFIG_OUTPUT_FRAME_SIZE 0 #define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 1 +#define CONFIG_QUANT_MATRIX 1 #define CONFIG_RATECTRL_LOG 0 #define CONFIG_RD_COMMAND 0 #define CONFIG_RD_DEBUG 0 @@ -89,6 +90,7 @@ #define HAVE_SSE4_2 0 #define HAVE_SSSE3 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_VSX 0 #define HAVE_WXWIDGETS 0 #define INLINE inline diff --git a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h index 50ee78932c..fffcc5a3e9 100644 --- a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h +++ b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h @@ -54,10 +54,16 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, in void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int 
diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm
index 63034fd7e2..1ec673f263 100644
--- a/media/libaom/config/linux/arm/config/aom_config.asm
+++ b/media/libaom/config/linux/arm/config/aom_config.asm
@@ -53,6 +53,7 @@
 .equ CONFIG_OUTPUT_FRAME_SIZE, 0
 .equ CONFIG_PARTITION_SEARCH_ORDER, 0
 .equ CONFIG_PIC, 1
+.equ CONFIG_QUANT_MATRIX, 1
 .equ CONFIG_RATECTRL_LOG, 0
 .equ CONFIG_RD_COMMAND, 0
 .equ CONFIG_RD_DEBUG, 0
@@ -87,6 +88,7 @@
 .equ HAVE_SSE4_2, 0
 .equ HAVE_SSSE3, 0
 .equ HAVE_SVE, 0
+.equ HAVE_SVE2, 0
 .equ HAVE_VSX, 0
 .equ HAVE_WXWIDGETS, 0
 .equ STATIC_LINK_JXL, 0
diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h
index 3cbe7bf169..fb73e8431e 100644
--- a/media/libaom/config/linux/arm/config/aom_config.h
+++ b/media/libaom/config/linux/arm/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 1
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 0
 #define HAVE_SSSE3 0
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
diff --git a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
index 50ee78932c..fffcc5a3e9 100644
--- a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
@@ -54,10 +54,16 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, in
 void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+#define aom_compute_correlation aom_compute_correlation_c
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+#define aom_compute_mean_stddev aom_compute_mean_stddev_c
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 
@@ -6212,9 +6218,6 @@ int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
 
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
 void aom_dsp_rtcd(void);
 
 #include "config/aom_config.h"
diff --git a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
index d296957f84..1024a666fe 100644
--- a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
 
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
 
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
 
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
 
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
 
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 
 void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/ia32/config/aom_config.asm b/media/libaom/config/linux/ia32/config/aom_config.asm
index e75260cb09..4fd596e34b 100644
--- a/media/libaom/config/linux/ia32/config/aom_config.asm
+++ b/media/libaom/config/linux/ia32/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 1
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/linux/ia32/config/aom_config.h b/media/libaom/config/linux/ia32/config/aom_config.h
index b0e5b5cabc..256f556662 100644
--- a/media/libaom/config/linux/ia32/config/aom_config.h
+++ b/media/libaom/config/linux/ia32/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 1
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
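Note how each platform carries the same flag flips twice, in aom_config.asm for assembly and aom_config.h for C, so both halves of the build agree on CONFIG_QUANT_MATRIX (newly 1 everywhere) and the still-disabled HAVE_SVE2 probe. These are compile-time gates, not runtime switches; a schematic of how C code typically guards on them (the function below is illustrative, not libaom code):

    #include "config/aom_config.h"

    /* Hypothetical consumer of the generated flags. */
    static int quant_matrix_compiled_in(void) {
    #if CONFIG_QUANT_MATRIX
      return 1;  /* quantisation-matrix paths are compiled in */
    #else
      return 0;  /* the code is absent from the binary entirely */
    #endif
    }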
diff --git a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
index a19adf5f61..93472f0e92 100644
--- a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1,
const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); + void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); + void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define aom_convolve8 aom_convolve8_c void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -903,6 +912,7 @@ RTCD_EXTERN unsigned int 
(*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); @@ -5130,7 +5140,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const RTCD_EXTERN unsigned int (*aom_sad16x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad16x4x3d aom_sad16x4x3d_c +void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); @@ -5466,7 +5477,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u #define aom_sad_skip_16x4 aom_sad_skip_16x4_c void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c +void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -5867,243 +5879,199 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); 
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t 
aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x32_sse2(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, 
int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6326,11 +6294,6 @@
 int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
 int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
 RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
 void aom_dsp_rtcd(void);
 
 #ifdef RTCD_C
@@ -6360,14 +6323,19 @@ static void setup_rtcd_internal(void)
     aom_comp_mask_pred = aom_comp_mask_pred_c;
     if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
     if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
     aom_compute_flow_at_point = aom_compute_flow_at_point_c;
     if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
     aom_convolve8_horiz = aom_convolve8_horiz_c;
-    if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
     aom_convolve8_vert = aom_convolve8_vert_c;
-    if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
     if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
     if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
     aom_convolve_copy = aom_convolve_copy_c;
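This hunk shows the runtime half of the RTCD scheme: each RTCD_EXTERN symbol declared earlier is a function pointer, and setup_rtcd_internal() rebinds it from the portable C kernel upward in ISA order, so the last flag the CPU satisfies wins. The SSE2 branches for the convolve kernels disappear here in step with the deleted _sse2 prototypes above. A self-contained miniature of the pattern — my_op and the flag bit values are hypothetical stand-ins for the generated names:

    /* Illustrative feature bits; the real values come from libaom's headers. */
    #define HAS_SSSE3 (1 << 4)
    #define HAS_AVX2 (1 << 7)

    static int my_op_c(int x) { return x + 1; }      /* portable baseline */
    static int my_op_ssse3(int x) { return x + 1; }  /* would use SSSE3 */
    static int my_op_avx2(int x) { return x + 1; }   /* would use AVX2 */

    static int (*my_op)(int x);

    static void setup_my_rtcd(int flags) {
      my_op = my_op_c;  /* always-valid fallback first */
      if (flags & HAS_SSSE3) my_op = my_op_ssse3;
      if (flags & HAS_AVX2) my_op = my_op_avx2;  /* last matching flag wins */
    }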
@@ -6768,6 +6736,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
     aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_c;
     if (flags & HAS_SSE2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
     aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_c;
     if (flags & HAS_SSE2) aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_sse2;
     aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
@@ -8526,6 +8495,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) aom_sad16x4 = aom_sad16x4_sse2;
     aom_sad16x4_avg = aom_sad16x4_avg_c;
     if (flags & HAS_SSE2) aom_sad16x4_avg = aom_sad16x4_avg_sse2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
     aom_sad16x4x4d = aom_sad16x4x4d_c;
     if (flags & HAS_SSE2) aom_sad16x4x4d = aom_sad16x4x4d_sse2;
     if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
@@ -8695,6 +8666,8 @@ static void setup_rtcd_internal(void)
     aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_c;
     if (flags & HAS_SSE2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
     if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
     aom_sad_skip_16x64 = aom_sad_skip_16x64_c;
     if (flags & HAS_SSE2) aom_sad_skip_16x64 = aom_sad_skip_16x64_sse2;
     aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_c;
@@ -8897,157 +8870,113 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
     if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
     aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
     aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
     aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
     aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
     aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
     aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
     aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
     aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
     aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
     if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
     if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
     aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
-    if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x64 =
aom_sub_pixel_avg_variance32x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2; aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3; aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3; aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3; aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3; aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2; aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3; aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2; aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2; aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3; aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3; aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3; aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = 
aom_sub_pixel_avg_variance8x8_ssse3; aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2; aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2; aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2; aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2; aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2; aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2; aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2; aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2; aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2; aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2; aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3; aom_sub_pixel_variance4x16 = 
aom_sub_pixel_variance4x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3; aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3; aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3; aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2; aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3; aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2; aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2; aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3; aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3; aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3; aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3; aom_subtract_block = aom_subtract_block_c; if (flags & HAS_SSE2) aom_subtract_block = aom_subtract_block_sse2; @@ -9172,9 +9101,6 @@ static void setup_rtcd_internal(void) aom_vector_var = aom_vector_var_c; if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1; if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2; - av1_compute_cross_correlation = av1_compute_cross_correlation_c; - if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1; - if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2; } #endif diff --git a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h index 3b70fb47c3..cdabb21106 100644 --- 
a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h +++ b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h @@ -8,13 +8,15 @@ #define RTCD_EXTERN extern #endif +#include <stdbool.h> + struct yv12_buffer_config; #ifdef __cplusplus extern "C" { #endif -void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes); +void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes); #define aom_extend_frame_borders aom_extend_frame_borders_c void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end); @@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes); #define aom_yv12_copy_frame aom_yv12_copy_frame_c -void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc); +void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop); #define aom_yv12_copy_u aom_yv12_copy_u_c -void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc); +void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop); #define aom_yv12_copy_v aom_yv12_copy_v_c -void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop); #define aom_yv12_copy_y aom_yv12_copy_y_c void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes); @@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c -int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes); +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes); #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c void aom_scale_rtcd(void); diff --git a/media/libaom/config/linux/ia32/config/av1_rtcd.h b/media/libaom/config/linux/ia32/config/av1_rtcd.h index 3f404f61c8..37716517bf 100644 --- a/media/libaom/config/linux/ia32/config/av1_rtcd.h +++ b/media/libaom/config/linux/ia32/config/av1_rtcd.h @@ -265,7 +265,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); -void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int 
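/*
 * The aom_scale_rtcd.h hunk above is an API-shape change rather than a
 * new kernel: the per-plane copy helpers grow a use_crop argument, and
 * aom_yv12_realloc_with_new_border() trades its num_pyramid_levels
 * count for a bool alloc_pyramid, which is why the header now includes
 * <stdbool.h>. A caller written against the old signatures would be
 * updated roughly as below (buffer setup elided; the argument values
 * are illustrative, not taken from this diff):
 *
 *   #include <stdbool.h>
 *
 *   struct yv12_buffer_config *src, *dst;  // assumed already set up
 *
 *   aom_yv12_copy_y(src, dst, 0);          // new trailing use_crop arg
 *   aom_yv12_copy_u(src, dst, 0);
 *   aom_yv12_copy_v(src, dst, 0);
 *
 *   // The 4th argument was an int pyramid-level count, now a flag:
 *   aom_yv12_realloc_with_new_border(dst,
 *                                    288,    // new_border
 *                                    32,     // byte_alignment
 *                                    false,  // alloc_pyramid (was int)
 *                                    3);     // num_planes
 */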
dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); @@ -764,84 +763,72 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params); void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); -void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); -void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int 
pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int 
dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int 
block_width, int block_height); void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift); -int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift); void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); -void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); @@ -969,7 +956,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2; if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2; av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c; - if (flags & HAS_SSE2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2; if (flags & HAS_SSSE3) 
av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3; if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2; av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_c; @@ -1176,62 +1162,50 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2; if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2; cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c; - if (flags & HAS_SSE2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2; if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3; if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2; cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c; - if (flags & HAS_SSE2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2; if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3; if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2; cdef_filter_16_0 = cdef_filter_16_0_c; - if (flags & HAS_SSE2) cdef_filter_16_0 = cdef_filter_16_0_sse2; if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2; cdef_filter_16_1 = cdef_filter_16_1_c; - if (flags & HAS_SSE2) cdef_filter_16_1 = cdef_filter_16_1_sse2; if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2; cdef_filter_16_2 = cdef_filter_16_2_c; - if (flags & HAS_SSE2) cdef_filter_16_2 = cdef_filter_16_2_sse2; if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2; cdef_filter_16_3 = cdef_filter_16_3_c; - if (flags & HAS_SSE2) cdef_filter_16_3 = cdef_filter_16_3_sse2; if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2; cdef_filter_8_0 = cdef_filter_8_0_c; - if (flags & HAS_SSE2) cdef_filter_8_0 = cdef_filter_8_0_sse2; if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2; cdef_filter_8_1 = cdef_filter_8_1_c; - if (flags & HAS_SSE2) cdef_filter_8_1 = cdef_filter_8_1_sse2; if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2; cdef_filter_8_2 = cdef_filter_8_2_c; - if (flags & HAS_SSE2) cdef_filter_8_2 = cdef_filter_8_2_sse2; if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2; cdef_filter_8_3 = cdef_filter_8_3_c; - if (flags & HAS_SSE2) cdef_filter_8_3 = cdef_filter_8_3_sse2; if (flags & HAS_SSSE3) cdef_filter_8_3 = 
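/*
 * The shape of this whole hunk: each cdef_* dispatcher loses only its
 * "if (flags & HAS_SSE2)" line while its SSSE3/SSE4.1/AVX2 candidates
 * stay. Upstream dropped the SSE2 cdef kernels, so on an SSE2-only x86
 * the pointer simply keeps the _c fallback it was initialised with and
 * no call site changes. What remains, shown for one entry:
 *
 *   cdef_find_dir = cdef_find_dir_c;                       // baseline
 *   if (flags & HAS_SSSE3)  cdef_find_dir = cdef_find_dir_ssse3;
 *   if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
 *   if (flags & HAS_AVX2)   cdef_find_dir = cdef_find_dir_avx2;
 */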
cdef_filter_8_3_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2; cdef_find_dir = cdef_find_dir_c; - if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2; if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3; if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1; if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2; cdef_find_dir_dual = cdef_find_dir_dual_c; - if (flags & HAS_SSE2) cdef_find_dir_dual = cdef_find_dir_dual_sse2; if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3; if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1; if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2; diff --git a/media/libaom/config/linux/x64/config/aom_config.asm b/media/libaom/config/linux/x64/config/aom_config.asm index f793ff3c6d..3f470f3a5f 100644 --- a/media/libaom/config/linux/x64/config/aom_config.asm +++ b/media/libaom/config/linux/x64/config/aom_config.asm @@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1 CONFIG_OUTPUT_FRAME_SIZE equ 0 CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 +CONFIG_QUANT_MATRIX equ 1 CONFIG_RATECTRL_LOG equ 0 CONFIG_RD_COMMAND equ 0 CONFIG_RD_DEBUG equ 0 @@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1 HAVE_SSE4_2 equ 1 HAVE_SSSE3 equ 1 HAVE_SVE equ 0 +HAVE_SVE2 equ 0 HAVE_VSX equ 0 HAVE_WXWIDGETS equ 0 STATIC_LINK_JXL equ 0 diff --git a/media/libaom/config/linux/x64/config/aom_config.h b/media/libaom/config/linux/x64/config/aom_config.h index 670d2ffe56..6d96b65b07 100644 --- a/media/libaom/config/linux/x64/config/aom_config.h +++ b/media/libaom/config/linux/x64/config/aom_config.h @@ -55,6 +55,7 @@ #define CONFIG_OUTPUT_FRAME_SIZE 0 #define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 +#define CONFIG_QUANT_MATRIX 1 #define CONFIG_RATECTRL_LOG 0 #define CONFIG_RD_COMMAND 0 #define CONFIG_RD_DEBUG 0 @@ -89,6 +90,7 @@ #define HAVE_SSE4_2 1 #define HAVE_SSSE3 1 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_VSX 0 #define HAVE_WXWIDGETS 0 #define INLINE inline diff --git a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h index 8e979cc189..9135c6f423 100644 --- a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h +++ b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h @@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +RTCD_EXTERN double 
(*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); + void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); + void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define aom_convolve8 aom_convolve8_c void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t 
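/*
 * aom_compute_mean_stddev() and aom_compute_correlation() are the
 * successors of av1_compute_cross_correlation() (removed in the ia32
 * header earlier in this diff). Judging from the signatures, each
 * patch's mean and 1/stddev are computed once and passed back in, so
 * correlating one patch against many candidates stops recomputing its
 * statistics. A scalar model of that split, assuming a square patch of
 * side P; P and the helper names are illustrative only:
 *
 *   #include <math.h>
 *   #include <stdbool.h>
 *   enum { P = 16 };
 *
 *   static bool patch_stats(const unsigned char *f, int stride,
 *                           int x, int y, double *mean,
 *                           double *one_over_stddev) {
 *     double sum = 0, sumsq = 0;
 *     for (int i = 0; i < P; i++)
 *       for (int j = 0; j < P; j++) {
 *         double v = f[(y + i) * stride + (x + j)];
 *         sum += v; sumsq += v * v;
 *       }
 *     *mean = sum / (P * P);
 *     double var = sumsq / (P * P) - *mean * *mean;
 *     if (var <= 0) return false;        // flat patch: unusable
 *     *one_over_stddev = 1.0 / sqrt(var);
 *     return true;
 *   }
 *
 *   // Normalised cross-correlation using the cached statistics:
 *   //   corr = (E[v1*v2] - mean1*mean2) * (1/sd1) * (1/sd2)
 *   static double patch_corr(const unsigned char *f1, int s1,
 *                            int x1, int y1, double mean1, double inv_sd1,
 *                            const unsigned char *f2, int s2,
 *                            int x2, int y2, double mean2, double inv_sd2) {
 *     double cross = 0;
 *     for (int i = 0; i < P; i++)
 *       for (int j = 0; j < P; j++)
 *         cross += (double)f1[(y1 + i) * s1 + (x1 + j)] *
 *                  (double)f2[(y2 + i) * s2 + (x2 + j)];
 *     double cov = cross / (P * P) - mean1 * mean2;
 *     return cov * inv_sd1 * inv_sd2;    // result lies in [-1, 1]
 *   }
 */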
*filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2 +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c @@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const #define aom_sad16x4_avg aom_sad16x4_avg_sse2 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad16x4x3d aom_sad16x4x3d_c +void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); @@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u #define aom_sad_skip_16x4 aom_sad_skip_16x4_c void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c +void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp, #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
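/*
 * Two recurring shapes in this hunk are worth naming. When a function
 * has exactly one usable kernel, the generated header binds it at
 * compile time with a #define; the moment a second kernel appears
 * (here: new AVX2 code for aom_highbd_10_mse16x16, aom_sad16x4x3d and
 * aom_sad_skip_16x4x4d), the binding has to move to run time, so the
 * #define becomes an RTCD_EXTERN function pointer that
 * setup_rtcd_internal() fills in. Call sites never change, because the
 * macro name and the pointer name are the same identifier:
 *
 *   // before (single kernel, preprocessor-resolved):
 *   #define aom_sad16x4x3d aom_sad16x4x3d_c
 *
 *   // after (two kernels, resolved once at init):
 *   RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *, int,
 *                                      const uint8_t *const[4], int,
 *                                      uint32_t[4]);
 *   ...
 *   aom_sad16x4x3d = aom_sad16x4x3d_c;
 *   if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
 */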
*second_pred); uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_avx2(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); 
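/*
 * Shared contract of every aom_sub_pixel_avg_variance{W}x{H} entry in
 * this run of declarations: interpolate src at the subpel offset
 * (xoffset, yoffset), average that prediction with second_pred, then
 * measure it against ref, returning the variance and writing the raw
 * sum of squared errors through *sse. The measurement step reduces to
 * the usual identity var = SSE - sum^2 / N; a scalar model of it (W, H
 * and the interpolation/averaging are elided, pred[] is the finished
 * prediction, and this is a sketch rather than the library's code):
 *
 *   #include <stdint.h>
 *
 *   static uint32_t variance(const uint8_t *pred, const uint8_t *ref,
 *                            int ref_stride, int W, int H,
 *                            uint32_t *sse) {
 *     int64_t sum = 0;
 *     uint64_t sq = 0;
 *     for (int i = 0; i < H; i++)
 *       for (int j = 0; j < W; j++) {
 *         int d = pred[i * W + j] - ref[i * ref_stride + j];
 *         sum += d;
 *         sq += (uint64_t)(d * d);
 *       }
 *     *sse = (uint32_t)sq;                        // raw SSE out-param
 *     return (uint32_t)(sq - (uint64_t)(sum * sum) / (W * H));
 *   }
 */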
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
 aom_comp_mask_pred = aom_comp_mask_pred_c;
 if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
 if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
 aom_compute_flow_at_point = aom_compute_flow_at_point_c;
 if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
- aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
 if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
 if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
- aom_convolve8_vert = aom_convolve8_vert_sse2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
 if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
 if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
 aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
 aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
 if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+ aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
 aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
 if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
 aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
 aom_sad16x32x4d = aom_sad16x32x4d_sse2;
 if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
 aom_sad16x4x4d = aom_sad16x4x4d_sse2;
 if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
 aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
 aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
 if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
 aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
 if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
 aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
 aom_sse = aom_sse_c;
 if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
 if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
- aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+ aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
- aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+ aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
- aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+ aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
- aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+ aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
- aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+ aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
- aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+ aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
- aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+ aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
- aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+ aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
- aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+ aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
- aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+ aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
- aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+ aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
- aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+ aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
- aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+ aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
- aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+ aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
- aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+ aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
- aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+ aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
- aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+ aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
- aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+ aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
- aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+ aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
- aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+ aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
- aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+ aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
- aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+ aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
- aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+ aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
- aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+ aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
- aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+ aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
- aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+ aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
- aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+ aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
- aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+ aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
- aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+ aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
- aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+ aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
- aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+ aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
- aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+ aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
- aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+ aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
- aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+ aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
- aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+ aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
- aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+ aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
- aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+ aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
- aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+ aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
- aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+ aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
- aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+ aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
 if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
- aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+ aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
- aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+ aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
- aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+ aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
- aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+ aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
 if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
 aom_subtract_block = aom_subtract_block_sse2;
 if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
 aom_vector_var = aom_vector_var_c;
 if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
 if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
 }
 #endif
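A note for readers skimming these generated headers: RTCD ("run-time CPU detection") exposes each DSP kernel either as a #define aliasing a single implementation or, when several SIMD variants exist, as a function pointer that setup_rtcd_internal() repoints according to CPU feature flags. The churn above is mostly the upstream removal of SSE2 kernels: each pointer's default falls back from the _sse2 variant to the portable _c variant, while SSSE3/SSE4.1/AVX2 variants still win when the flags allow. A minimal sketch of the pattern, with hypothetical op_* kernels (libaom derives its flags from its own CPU-capability query; the GCC/Clang builtin below is only a stand-in):

#include <stdio.h>

/* Hypothetical kernels standing in for the generated aom_* symbols. */
static int op_c(int x)     { return 2 * x; }  /* portable baseline */
static int op_ssse3(int x) { return 2 * x; }  /* pretend SIMD version */
static int op_avx2(int x)  { return 2 * x; }  /* pretend wider SIMD */

/* Function pointer, analogous to an RTCD_EXTERN symbol. */
static int (*op)(int);

static void setup_rtcd(void) {
  op = op_c;  /* safe default, like the _c fallbacks in the hunks above */
  if (__builtin_cpu_supports("ssse3")) op = op_ssse3;
  if (__builtin_cpu_supports("avx2")) op = op_avx2;  /* best match assigned last */
}

int main(void) {
  setup_rtcd();
  printf("%d\n", op(21));  /* dispatches to the best available kernel */
  return 0;
}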
diff --git a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
 #define RTCD_EXTERN extern
 #endif
+#include <stdbool.h>
+
 struct yv12_buffer_config;
 #ifdef __cplusplus
 extern "C" {
 #endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
 #define aom_extend_frame_borders aom_extend_frame_borders_c
 void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
 void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
 #define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
 #define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
 #define aom_yv12_copy_y aom_yv12_copy_y_c
 void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
 void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
 #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
 #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
 void aom_scale_rtcd(void);
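Two API changes ripple through aom_scale_rtcd.h above: aom_yv12_copy_{y,u,v}_c gain a use_crop flag, and aom_yv12_realloc_with_new_border_c swaps its int num_pyramid_levels parameter for a bool alloc_pyramid, which is why the header now pulls in <stdbool.h>. A hedged sketch of what a call site looks like after the change (grow_border and all constants are invented for illustration; only the prototype is taken from the hunk above):

#include <stdbool.h>

struct yv12_buffer_config;  /* opaque here, as in the header */

/* New-style prototype, per the diff above. */
int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf,
                                       int new_border, int byte_alignment,
                                       bool alloc_pyramid, int num_planes);

/* Hypothetical caller: code that used to pass a pyramid level count
 * now only states whether a pyramid should be allocated at all. */
static int grow_border(struct yv12_buffer_config *buf) {
  const int kBorder = 288, kAlign = 32, kPlanes = 3;  /* illustrative values */
  return aom_yv12_realloc_with_new_border_c(buf, kBorder, kAlign,
                                            /*alloc_pyramid=*/true, kPlanes);
}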
diff --git a/media/libaom/config/linux/x64/config/av1_rtcd.h b/media/libaom/config/linux/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/linux/x64/config/av1_rtcd.h
+++ b/media/libaom/config/linux/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
 #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
 RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
 RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
 RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
 RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
 void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
 RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
 av1_convolve_y_sr = av1_convolve_y_sr_sse2;
 if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
- av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+ av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
 if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
 if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
 av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
 av1_inv_txfm_add = av1_inv_txfm_add_c;
 if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
 if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
- av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+ av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
 if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
 if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
 av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
 av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
 if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
- cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+ cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
 if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
 if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
- cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+ cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
 if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
 if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
- cdef_filter_16_0 = cdef_filter_16_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+ cdef_filter_16_0 = cdef_filter_16_0_c;
 if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
- cdef_filter_16_1 = cdef_filter_16_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+ cdef_filter_16_1 = cdef_filter_16_1_c;
 if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
- cdef_filter_16_2 = cdef_filter_16_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+ cdef_filter_16_2 = cdef_filter_16_2_c;
 if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
- cdef_filter_16_3 = cdef_filter_16_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+ cdef_filter_16_3 = cdef_filter_16_3_c;
 if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
- cdef_filter_8_0 = cdef_filter_8_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+ cdef_filter_8_0 = cdef_filter_8_0_c;
 if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
- cdef_filter_8_1 = cdef_filter_8_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+ cdef_filter_8_1 = cdef_filter_8_1_c;
 if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
- cdef_filter_8_2 = cdef_filter_8_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+ cdef_filter_8_2 = cdef_filter_8_2_c;
 if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
- cdef_filter_8_3 = cdef_filter_8_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+ cdef_filter_8_3 = cdef_filter_8_3_c;
 if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
 if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
- cdef_find_dir = cdef_find_dir_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ cdef_find_dir = cdef_find_dir_c;
 if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
 if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
- cdef_find_dir_dual = cdef_find_dir_dual_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+ cdef_find_dir_dual = cdef_find_dir_dual_c;
 if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
 if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
 cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
diff --git a/media/libaom/config/mac/x64/config/aom_config.asm b/media/libaom/config/mac/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/mac/x64/config/aom_config.asm
+++ b/media/libaom/config/mac/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
 CONFIG_OUTPUT_FRAME_SIZE equ 0
 CONFIG_PARTITION_SEARCH_ORDER equ 0
 CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
 CONFIG_RATECTRL_LOG equ 0
 CONFIG_RD_COMMAND equ 0
 CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
 HAVE_SSE4_2 equ 1
 HAVE_SSSE3 equ 1
 HAVE_SVE equ 0
+HAVE_SVE2 equ 0
 HAVE_VSX equ 0
 HAVE_WXWIDGETS equ 0
 STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/mac/x64/config/aom_config.h b/media/libaom/config/mac/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/mac/x64/config/aom_config.h
+++ b/media/libaom/config/mac/x64/config/aom_config.h
@@ -55,6 +55,7 @@
 #define CONFIG_OUTPUT_FRAME_SIZE 0
 #define CONFIG_PARTITION_SEARCH_ORDER 0
 #define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
 #define CONFIG_RATECTRL_LOG 0
 #define CONFIG_RD_COMMAND 0
 #define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
 #define HAVE_SSE4_2 1
 #define HAVE_SSSE3 1
 #define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_VSX 0
 #define HAVE_WXWIDGETS 0
 #define INLINE inline
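The next file replaces av1_compute_cross_correlation, which took only the two raw patches and so had to normalise each pair from scratch, with a two-stage API: aom_compute_mean_stddev caches a patch's mean and reciprocal standard deviation once per feature point (returning false for flat patches whose correlation would be undefined), and aom_compute_correlation then consumes the cached values for every candidate pair. A scalar sketch of that factorisation, with an illustrative PATCH size that is not necessarily libaom's:

#include <math.h>
#include <stdbool.h>

#define PATCH 16  /* illustrative patch size */

/* Stage 1: per-patch statistics, computed once per feature point. */
static bool mean_stddev(const unsigned char *p, int stride,
                        double *mean, double *one_over_stddev) {
  double sum = 0.0, sumsq = 0.0;
  for (int i = 0; i < PATCH; i++)
    for (int j = 0; j < PATCH; j++) {
      const double v = p[i * stride + j];
      sum += v;
      sumsq += v * v;
    }
  const double n = PATCH * PATCH;
  *mean = sum / n;
  const double ssd = sumsq - sum * (*mean);  /* sum of squared deviations */
  if (ssd < 1e-9) return false;  /* flat patch: reject, as the bool return suggests */
  *one_over_stddev = 1.0 / sqrt(ssd);
  return true;
}

/* Stage 2: normalised cross-correlation from the cached statistics. */
static double correlation(const unsigned char *a, int stride_a, double mean_a,
                          double inv_a, const unsigned char *b, int stride_b,
                          double mean_b, double inv_b) {
  double cross = 0.0;
  for (int i = 0; i < PATCH; i++)
    for (int j = 0; j < PATCH; j++)
      cross += (double)a[i * stride_a + j] * b[i * stride_b + j];
  /* cross - n*mean_a*mean_b equals sum((a - mean_a) * (b - mean_b)),
   * so the result lies in [-1, 1]. */
  return (cross - (double)(PATCH * PATCH) * mean_a * mean_b) * inv_a * inv_b;
}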
diff --git a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
 void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
 RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
 void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
 RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define aom_convolve8 aom_convolve8_c
 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
 unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 #define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 #define aom_sad16x4_avg aom_sad16x4_avg_sse2
 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
 #define aom_sad_skip_16x4 aom_sad_skip_16x4_c
 void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
 unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
 #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
aom_comp_mask_pred = aom_comp_mask_pred_c;
if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
aom_compute_flow_at_point = aom_compute_flow_at_point_c;
if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
- aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
- aom_convolve8_vert = aom_convolve8_vert_sse2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+ aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
aom_sad16x32x4d = aom_sad16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
aom_sad16x4x4d = aom_sad16x4x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
aom_sse = aom_sse_c;
if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
- aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+ aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
- aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+ aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
- aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+ aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
- aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+ aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
- aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+ aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
- aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+ aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
- aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+ aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
- aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+ aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
- aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+ aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
- aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+ aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
- aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+ aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
- aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+ aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
- aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+ aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
- aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+ aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
- aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+ aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
- aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+ aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
- aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+ aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
- aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+ aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
- aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+ aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
- aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+ aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
- aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+ aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
- aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+ aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
- aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+ aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
- aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+ aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
- aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+ aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
- aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+ aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
- aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+ aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
- aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+ aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
- aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+ aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
- aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+ aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
- aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+ aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
- aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+ aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
- aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+ aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
- aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+ aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
- aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+ aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
- aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+ aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
- aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+ aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
- aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+ aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
- aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+ aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
- aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+ aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
- aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+ aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
- aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+ aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
- aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+ aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
- aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+ aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_sse2;
if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
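All of the aom_dsp_rtcd.h hunks above follow libaom's run-time CPU detection (RTCD) pattern: each `RTCD_EXTERN` declaration is a function pointer, and `setup_rtcd_internal()` first assigns a baseline implementation (after this change, the plain C fallback rather than SSE2 for many kernels) and then upgrades it step by step to the best variant the detected CPU flags allow. The following is a minimal, self-contained sketch of that dispatch shape; `my_kernel` and the simplified flag values are hypothetical and not code from this patch.

/* Sketch of the RTCD dispatch pattern (hypothetical names). */
#include <stdint.h>

#define HAS_SSSE3 0x1
#define HAS_AVX2  0x2

/* One function per implementation tier, as in the generated header. */
static int my_kernel_c(const uint8_t *src, int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++) sum += src[i];
    return sum;
}
/* Stand-ins for SIMD variants; real code would use intrinsics. */
static int my_kernel_ssse3(const uint8_t *src, int n) { return my_kernel_c(src, n); }
static int my_kernel_avx2(const uint8_t *src, int n)  { return my_kernel_c(src, n); }

/* The pointer callers go through (RTCD_EXTERN in the header). */
static int (*my_kernel)(const uint8_t *src, int n);

/* Mirrors setup_rtcd_internal(): start at the baseline, then upgrade. */
static void setup_my_rtcd(int flags)
{
    my_kernel = my_kernel_c;
    if (flags & HAS_SSSE3) my_kernel = my_kernel_ssse3;
    if (flags & HAS_AVX2)  my_kernel = my_kernel_avx2;
}

Because each `if` overwrites the previous assignment, listing the variants in ascending order of capability leaves the pointer on the fastest supported one; dropping an SSE2 variant, as this patch does, simply changes which assignment comes first.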
diff --git a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
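The aom_scale_rtcd.h hunk changes the ABI of several plane-copy helpers (a new `use_crop` argument) and replaces `num_pyramid_levels` with a `bool alloc_pyramid`, which is why the header now pulls in `<stdbool.h>`. The sketch below is a hypothetical caller updated for these new signatures; it assumes the header above is included and that `src`, `dst`, and `buf` are valid `yv12_buffer_config` objects, and the numeric border/alignment values are illustrative only.

/* Hypothetical caller adapted to the new signatures shown above. */
#include <stdbool.h>

void copy_planes_no_crop(const struct yv12_buffer_config *src,
                         struct yv12_buffer_config *dst)
{
    /* The per-plane copies now take an explicit use_crop flag. */
    aom_yv12_copy_y(src, dst, /*use_crop=*/0);
    aom_yv12_copy_u(src, dst, /*use_crop=*/0);
    aom_yv12_copy_v(src, dst, /*use_crop=*/0);
}

int grow_border(struct yv12_buffer_config *buf, int num_planes)
{
    /* alloc_pyramid is now a bool (allocate or not) rather than a
     * pyramid level count; 288/32 are placeholder values. */
    return aom_yv12_realloc_with_new_border(buf, /*new_border=*/288,
                                            /*byte_alignment=*/32,
                                            /*alloc_pyramid=*/false,
                                            num_planes);
}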
InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); @@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); -void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); @@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params); void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); -void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); -void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); -void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); -void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); RTCD_EXTERN void 
(*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int 
pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int 
block_width, int block_height); void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift); -int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); -int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift); void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t 
*var2, int coeff_shift, int *out1, int *out2); -void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); -void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); @@ -941,7 +915,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2; av1_convolve_y_sr = av1_convolve_y_sr_sse2; if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2; - av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2; + av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c; if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3; if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2; av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2; @@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void) av1_inv_txfm_add = av1_inv_txfm_add_c; if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3; if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2; - av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2; + av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c; if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1; if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2; av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c; @@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2; av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2; if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2; - cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2; - if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3; + cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c; if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2; - cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2; - if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3; + cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c; if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2; - cdef_filter_16_0 = cdef_filter_16_0_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3; + cdef_filter_16_0 = cdef_filter_16_0_c; if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2; - cdef_filter_16_1 = cdef_filter_16_1_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3; + cdef_filter_16_1 = cdef_filter_16_1_c; if (flags & HAS_SSE4_1) 
cdef_filter_16_1 = cdef_filter_16_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2; - cdef_filter_16_2 = cdef_filter_16_2_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3; + cdef_filter_16_2 = cdef_filter_16_2_c; if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2; - cdef_filter_16_3 = cdef_filter_16_3_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3; + cdef_filter_16_3 = cdef_filter_16_3_c; if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2; - cdef_filter_8_0 = cdef_filter_8_0_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3; + cdef_filter_8_0 = cdef_filter_8_0_c; if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2; - cdef_filter_8_1 = cdef_filter_8_1_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3; + cdef_filter_8_1 = cdef_filter_8_1_c; if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2; - cdef_filter_8_2 = cdef_filter_8_2_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3; + cdef_filter_8_2 = cdef_filter_8_2_c; if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2; - cdef_filter_8_3 = cdef_filter_8_3_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3; + cdef_filter_8_3 = cdef_filter_8_3_c; if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2; - cdef_find_dir = cdef_find_dir_sse2; - if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3; + cdef_find_dir = cdef_find_dir_c; if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1; if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2; - cdef_find_dir_dual = cdef_find_dir_dual_sse2; - if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3; + cdef_find_dir_dual = cdef_find_dir_dual_c; if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1; if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2; cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c; diff --git a/media/libaom/config/win/ia32/config/aom_config.asm b/media/libaom/config/win/ia32/config/aom_config.asm index af78328283..8f6c3592fa 100644 --- a/media/libaom/config/win/ia32/config/aom_config.asm +++ b/media/libaom/config/win/ia32/config/aom_config.asm @@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1 CONFIG_OUTPUT_FRAME_SIZE equ 0 CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 +CONFIG_QUANT_MATRIX equ 1 CONFIG_RATECTRL_LOG equ 0 CONFIG_RD_COMMAND equ 0 CONFIG_RD_DEBUG equ 0 @@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1 HAVE_SSE4_2 equ 1 HAVE_SSSE3 equ 1 HAVE_SVE equ 0 +HAVE_SVE2 equ 0 HAVE_VSX equ 0 HAVE_WXWIDGETS equ 0 STATIC_LINK_JXL equ 0 diff --git a/media/libaom/config/win/ia32/config/aom_config.h b/media/libaom/config/win/ia32/config/aom_config.h index dba805b1b6..7d1ce61373 100644 --- a/media/libaom/config/win/ia32/config/aom_config.h +++ b/media/libaom/config/win/ia32/config/aom_config.h @@ -55,6 +55,7 @@ #define CONFIG_OUTPUT_FRAME_SIZE 0 #define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 +#define CONFIG_QUANT_MATRIX 1 #define CONFIG_RATECTRL_LOG 0 #define CONFIG_RD_COMMAND 0 #define CONFIG_RD_DEBUG 0 @@ 
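/*
 * aom_config.asm and aom_config.h carry the same flags in two syntaxes
 * ("equ" for the assembler, "#define" for C) so that assembly and C
 * translation units are built against one configuration. A hedged sketch
 * of how a newly enabled flag such as CONFIG_QUANT_MATRIX is typically
 * consumed on the C side (the guarded function is illustrative only):
 *
 *   #include "config/aom_config.h"
 *
 *   #if CONFIG_QUANT_MATRIX
 *   // quantization-matrix support is compiled in
 *   static void my_apply_qm(int16_t *coeffs, const int16_t *qm, int n) {
 *     for (int i = 0; i < n; i++)
 *       coeffs[i] = (int16_t)((coeffs[i] * qm[i]) >> 5);  // shift illustrative
 *   }
 *   #endif
 *
 * HAVE_SVE2 equ 0 / #define HAVE_SVE2 0 simply records that the (Arm)
 * SVE2 paths are off for this x86 target; the flag is defined on every
 * platform so the shared RTCD templates expand consistently.
 */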
-89,6 +90,7 @@ #define HAVE_SSE4_2 1 #define HAVE_SSSE3 1 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_VSX 0 #define HAVE_WXWIDGETS 0 #define INLINE inline diff --git a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h index a19adf5f61..93472f0e92 100644 --- a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h +++ b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h @@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); +RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2); + void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v); +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); +RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev); + void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define aom_convolve8 aom_convolve8_c void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int 
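/*
 * The new aom_compute_mean_stddev / aom_compute_correlation pair splits
 * normalized cross-correlation into a per-patch statistics pass and a
 * pairwise pass, so each patch's mean and 1/stddev are computed once and
 * reused. A rough sketch of the arithmetic the signatures imply; the
 * window size W and every detail below are assumptions, not libaom code:
 *
 *   enum { W = 16 };  // hypothetical patch width/height
 *
 *   double my_ncc(const unsigned char *f1, int s1, int x1, int y1,
 *                 double mean1, double inv_sd1,
 *                 const unsigned char *f2, int s2, int x2, int y2,
 *                 double mean2, double inv_sd2) {
 *     double acc = 0.0;
 *     for (int i = 0; i < W; i++)
 *       for (int j = 0; j < W; j++)
 *         acc += (f1[(y1 + i) * s1 + (x1 + j)] - mean1) *
 *                (f2[(y2 + i) * s2 + (x2 + j)] - mean2);
 *     return (acc / (W * W)) * inv_sd1 * inv_sd2;  // ncc in [-1, 1]
 *   }
 *
 * Returning bool from aom_compute_mean_stddev lets callers reject
 * near-constant patches (stddev close to zero) before the pairwise pass.
 */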
x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -903,6 +912,7 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); @@ -5130,7 +5140,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const RTCD_EXTERN unsigned int (*aom_sad16x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad16x4x3d aom_sad16x4x3d_c +void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); @@ -5466,7 
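/*
 * The aom_sad16x4x3d hunk shows the two dispatch idioms these generated
 * headers use. With a single implementation there is a zero-cost alias;
 * once a SIMD variant exists, the alias becomes a run-time pointer. In
 * sketch form (my_sad is a hypothetical name):
 *
 *   // one implementation: compile-time alias, no indirection
 *   void my_sad_c(const uint8_t *src, int n);
 *   #define my_sad my_sad_c
 *
 *   // several implementations: pointer filled in by setup_rtcd_internal()
 *   void my_sad_c(const uint8_t *src, int n);
 *   void my_sad_avx2(const uint8_t *src, int n);
 *   RTCD_EXTERN void (*my_sad)(const uint8_t *src, int n);
 *
 * That is the shape of the change from '-#define aom_sad16x4x3d
 * aom_sad16x4x3d_c' to a '+RTCD_EXTERN' pointer once the _avx2 variant
 * is added, and likewise for aom_sad_skip_16x4x4d.
 */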
+5477,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u #define aom_sad_skip_16x4 aom_sad_skip_16x4_c void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c +void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -5867,243 +5879,199 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t 
aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t 
(*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse); uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); 
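The pattern repeated throughout this header is libaom's run-time CPU dispatch (RTCD): each kernel is declared once per ISA level, plus an RTCD_EXTERN function pointer that setup_rtcd_internal() below rebinds to the widest ISA the host supports. With the _sse2 rows removed in this version, SSE2-only hosts simply stay on the C baseline. A minimal self-contained sketch of the mechanism, with all names invented for illustration:

#include <stdint.h>

/* Hypothetical stand-ins for one kernel family. */
static uint32_t variance_c(const uint8_t *s, int stride)     { (void)s; (void)stride; return 0; }
static uint32_t variance_ssse3(const uint8_t *s, int stride) { (void)s; (void)stride; return 0; }
static uint32_t variance_avx2(const uint8_t *s, int stride)  { (void)s; (void)stride; return 0; }

#define HAS_SSSE3 (1 << 0)
#define HAS_AVX2  (1 << 1)

/* The RTCD pointer: starts at the C baseline, then each satisfied flag
 * overwrites it, so the last (widest) supported ISA wins. */
static uint32_t (*variance)(const uint8_t *, int);

static void setup_rtcd(int flags) {
  variance = variance_c;
  if (flags & HAS_SSSE3) variance = variance_ssse3;
  if (flags & HAS_AVX2)  variance = variance_avx2;
}

int main(void) {
  setup_rtcd(HAS_SSSE3 | HAS_AVX2);
  return (int)variance((const uint8_t *)"", 0);
}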
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); @@ -6326,11 +6294,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl); int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl); RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl); -double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2); -double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2); -double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2); -RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2); - void aom_dsp_rtcd(void); #ifdef RTCD_C @@ -6360,14 +6323,19 @@ static void setup_rtcd_internal(void) aom_comp_mask_pred = aom_comp_mask_pred_c; if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3; if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2; + aom_compute_correlation = aom_compute_correlation_c; + if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1; + if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2; aom_compute_flow_at_point = aom_compute_flow_at_point_c; if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1; + if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2; + aom_compute_mean_stddev = aom_compute_mean_stddev_c; + if (flags & 
HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1; + if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2; aom_convolve8_horiz = aom_convolve8_horiz_c; - if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2; if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3; if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2; aom_convolve8_vert = aom_convolve8_vert_c; - if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2; if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3; if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2; aom_convolve_copy = aom_convolve_copy_c; @@ -6768,6 +6736,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3; aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_c; if (flags & HAS_SSE2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2; + if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2; aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_c; if (flags & HAS_SSE2) aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_sse2; aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c; @@ -8526,6 +8495,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) aom_sad16x4 = aom_sad16x4_sse2; aom_sad16x4_avg = aom_sad16x4_avg_c; if (flags & HAS_SSE2) aom_sad16x4_avg = aom_sad16x4_avg_sse2; + aom_sad16x4x3d = aom_sad16x4x3d_c; + if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2; aom_sad16x4x4d = aom_sad16x4x4d_c; if (flags & HAS_SSE2) aom_sad16x4x4d = aom_sad16x4x4d_sse2; if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2; @@ -8695,6 +8666,8 @@ static void setup_rtcd_internal(void) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_c; if (flags & HAS_SSE2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2; if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2; + aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c; + if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2; aom_sad_skip_16x64 = aom_sad_skip_16x64_c; if (flags & HAS_SSE2) aom_sad_skip_16x64 = aom_sad_skip_16x64_sse2; aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_c; @@ -8897,157 +8870,113 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1; if (flags & HAS_AVX2) aom_sse = aom_sse_avx2; aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2; aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2; aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3; aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c; - if (flags & HAS_SSE2) 
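The flags word tested in these setup chains comes from libaom's own CPU detection. As a rough illustration only (this is not the detection code libaom ships), the same information can be queried on x86 with the GCC/Clang CPU builtins:

#include <stdio.h>

int main(void) {
  /* GCC/Clang-specific builtins; they mirror how an RTCD-style
   * HAS_* flags word could be derived on x86. */
  __builtin_cpu_init();
  printf("ssse3:%d sse4.1:%d avx2:%d\n",
         __builtin_cpu_supports("ssse3"),
         __builtin_cpu_supports("sse4.1"),
         __builtin_cpu_supports("avx2"));
  return 0;
}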
aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3; aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3; aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3; aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3; aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2; aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2; aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2; aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3; aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3; aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3; aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3; aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2; aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2; if (flags & 
HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3; aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2; aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2; aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3; aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3; aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3; aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3; aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2; aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2; aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2; aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2; aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2; aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x64 = 
aom_sub_pixel_variance16x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2; aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2; aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2; aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2; aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2; aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3; aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3; aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3; aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3; aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2; aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3; aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3; if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2; aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c; - if (flags & HAS_SSE2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2; if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3; if (flags & HAS_AVX2) 
aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_c;
if (flags & HAS_SSE2) aom_subtract_block = aom_subtract_block_sse2;
@@ -9172,9 +9101,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
diff --git a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
+++ b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct
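The aom_scale_rtcd.h hunk above is an API change rather than a dispatch change: the per-plane copy helpers grow a use_crop argument and the reallocation helper swaps an integer pyramid-level count for a bool, which is why the header now pulls in <stdbool.h>. A compilable sketch of what migrating a call site looks like; the stub bodies and the exact use_crop semantics (cropped vs. full padded extent) are assumptions:

#include <stdbool.h>
#include <stdio.h>

struct yv12_buffer_config; /* opaque here, as in the header */

/* Stub with the new shape of aom_yv12_copy_y so the sketch compiles. */
static void yv12_copy_y(const struct yv12_buffer_config *src,
                        struct yv12_buffer_config *dst, int use_crop) {
  (void)src; (void)dst;
  printf("copy %s extent\n", use_crop ? "cropped" : "padded");
}

static int realloc_with_new_border(struct yv12_buffer_config *ybf, int new_border,
                                   int byte_alignment, bool alloc_pyramid,
                                   int num_planes) {
  (void)ybf; (void)new_border; (void)byte_alignment; (void)num_planes;
  return alloc_pyramid ? 1 : 0;
}

int main(void) {
  yv12_copy_y(NULL, NULL, 1);  /* previously: aom_yv12_copy_y(src, dst) */
  /* bool alloc_pyramid replaces the old int num_pyramid_levels */
  return realloc_with_new_border(NULL, 32, 16, false, 3);
}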
yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c -int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes); +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes); #define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c void aom_scale_rtcd(void); diff --git a/media/libaom/config/win/ia32/config/av1_rtcd.h b/media/libaom/config/win/ia32/config/av1_rtcd.h index 3f404f61c8..37716517bf 100644 --- a/media/libaom/config/win/ia32/config/av1_rtcd.h +++ b/media/libaom/config/win/ia32/config/av1_rtcd.h @@ -265,7 +265,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds #define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); -void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); @@ -764,84 +763,72 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params); void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); -void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height); RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int 
dstride, const uint16_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); -void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height); void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_2_sse2(void *dst16, 
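CDEF operates on a uint16_t intermediate buffer, so the 8-bit copy declared above is just a widening rectangle copy. A plain-C equivalent of what the _c fallback does (a sketch; the shipped implementation may differ in details):

#include <stdint.h>

void copy_rect8_8bit_to_16bit(uint16_t *dst, int dstride,
                              const uint8_t *src, int sstride,
                              int width, int height) {
  /* Widen each 8-bit sample to 16 bits, row by row. */
  for (int i = 0; i < height; i++)
    for (int j = 0; j < width; j++)
      dst[i * dstride + j] = src[i * sstride + j];
}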
int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, 
int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); -void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int 
coeff_shift, int block_width, int block_height); int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift); -int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift); void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); -void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); @@ -969,7 +956,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2; if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2; av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c; - if (flags & HAS_SSE2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2; if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3; if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2; av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_c; @@ -1176,62 +1162,50 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2; if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2; cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c; - if (flags & HAS_SSE2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2; if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3; if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2; cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c; - if (flags & HAS_SSE2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2; if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3; if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2; cdef_filter_16_0 = cdef_filter_16_0_c; - if (flags & HAS_SSE2) cdef_filter_16_0 = cdef_filter_16_0_sse2; if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2; cdef_filter_16_1 = cdef_filter_16_1_c; - if (flags & HAS_SSE2) cdef_filter_16_1 = cdef_filter_16_1_sse2; if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_1 = 
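cdef_find_dir_dual packs two direction searches into one call so the SIMD versions can share work; a C fallback can be nothing more than two single searches. A sketch under that assumption, with a stub standing in for the real cdef_find_dir:

#include <stdint.h>

/* Stub with the same shape as cdef_find_dir_c above; the real function
 * returns the dominant edge direction of a block and writes its
 * directional variance through *var. */
static int cdef_find_dir(const uint16_t *img, int stride, int32_t *var,
                         int coeff_shift) {
  (void)img; (void)stride; (void)coeff_shift;
  *var = 0;
  return 0;
}

void find_dir_dual(const uint16_t *img1, const uint16_t *img2, int stride,
                   int32_t *var1, int32_t *var2, int coeff_shift,
                   int *out1, int *out2) {
  *out1 = cdef_find_dir(img1, stride, var1, coeff_shift);
  *out2 = cdef_find_dir(img2, stride, var2, coeff_shift);
}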
cdef_filter_16_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2; cdef_filter_16_2 = cdef_filter_16_2_c; - if (flags & HAS_SSE2) cdef_filter_16_2 = cdef_filter_16_2_sse2; if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2; cdef_filter_16_3 = cdef_filter_16_3_c; - if (flags & HAS_SSE2) cdef_filter_16_3 = cdef_filter_16_3_sse2; if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3; if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2; cdef_filter_8_0 = cdef_filter_8_0_c; - if (flags & HAS_SSE2) cdef_filter_8_0 = cdef_filter_8_0_sse2; if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2; cdef_filter_8_1 = cdef_filter_8_1_c; - if (flags & HAS_SSE2) cdef_filter_8_1 = cdef_filter_8_1_sse2; if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2; cdef_filter_8_2 = cdef_filter_8_2_c; - if (flags & HAS_SSE2) cdef_filter_8_2 = cdef_filter_8_2_sse2; if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2; cdef_filter_8_3 = cdef_filter_8_3_c; - if (flags & HAS_SSE2) cdef_filter_8_3 = cdef_filter_8_3_sse2; if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3; if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2; cdef_find_dir = cdef_find_dir_c; - if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2; if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3; if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1; if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2; cdef_find_dir_dual = cdef_find_dir_dual_c; - if (flags & HAS_SSE2) cdef_find_dir_dual = cdef_find_dir_dual_sse2; if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3; if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1; if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2; diff --git a/media/libaom/config/win/x64/config/aom_config.asm b/media/libaom/config/win/x64/config/aom_config.asm index f793ff3c6d..3f470f3a5f 100644 --- a/media/libaom/config/win/x64/config/aom_config.asm +++ b/media/libaom/config/win/x64/config/aom_config.asm @@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1 CONFIG_OUTPUT_FRAME_SIZE equ 0 CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 +CONFIG_QUANT_MATRIX equ 1 CONFIG_RATECTRL_LOG equ 0 CONFIG_RD_COMMAND equ 0 CONFIG_RD_DEBUG equ 0 @@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1 HAVE_SSE4_2 equ 1 HAVE_SSSE3 equ 1 HAVE_SVE equ 0 +HAVE_SVE2 equ 0 HAVE_VSX equ 0 HAVE_WXWIDGETS equ 0 STATIC_LINK_JXL equ 0 diff --git a/media/libaom/config/win/x64/config/aom_config.h b/media/libaom/config/win/x64/config/aom_config.h index 670d2ffe56..6d96b65b07 100644 --- a/media/libaom/config/win/x64/config/aom_config.h +++ b/media/libaom/config/win/x64/config/aom_config.h @@ -55,6 +55,7 @@ #define CONFIG_OUTPUT_FRAME_SIZE 0 #define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 +#define CONFIG_QUANT_MATRIX 1 #define CONFIG_RATECTRL_LOG 0 #define 
CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 1
#define HAVE_SSSE3 1
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
diff --git a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t
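These additions replace the av1_compute_cross_correlation entry removed earlier in this diff: per-patch mean and 1/stddev are now computed once by aom_compute_mean_stddev (whose bool return can reject unusable patches) and reused across every pairing by aom_compute_correlation. A plain-C sketch of such a split normalized cross-correlation; the window size and the exact normalization are assumptions:

#include <math.h>
#include <stdbool.h>

#define PATCH 16 /* hypothetical window; the real match size lives in aom's sources */

bool compute_mean_stddev(const unsigned char *frame, int stride, int x, int y,
                         double *mean, double *one_over_stddev) {
  double sum = 0, sumsq = 0;
  for (int i = 0; i < PATCH; i++)
    for (int j = 0; j < PATCH; j++) {
      const double v = frame[(y + i) * stride + (x + j)];
      sum += v;
      sumsq += v * v;
    }
  const double n = (double)(PATCH * PATCH);
  *mean = sum / n;
  const double ss = sumsq - sum * sum / n; /* sum of squared deviations */
  if (ss < 1e-10) return false;            /* flat patch: no usable feature */
  *one_over_stddev = 1.0 / sqrt(ss);
  return true;
}

double compute_correlation(const unsigned char *f1, int s1, int x1, int y1,
                           double mean1, double one_over_stddev1,
                           const unsigned char *f2, int s2, int x2, int y2,
                           double mean2, double one_over_stddev2) {
  double cross = 0;
  for (int i = 0; i < PATCH; i++)
    for (int j = 0; j < PATCH; j++)
      cross += (f1[(y1 + i) * s1 + (x1 + j)] - mean1) *
               (f2[(y2 + i) * s2 + (x2 + j)] - mean2);
  return cross * one_over_stddev1 * one_over_stddev2; /* NCC in [-1, 1] */
}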
dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2 +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c @@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const #define aom_sad16x4_avg aom_sad16x4_avg_sse2 void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad16x4x3d aom_sad16x4x3d_c +void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int 
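aom_sad16x4x3d scores one 16x4 source block against several candidate references in a single call, which is what makes the AVX2 version worthwhile. A reference-style sketch of the x4d semantics (the x3d variant presumably fills only the first three results, and the sad_skip_* family subsamples rows):

#include <stdint.h>
#include <stdlib.h>

void sad16x4x4d(const uint8_t *src, int src_stride,
                const uint8_t *const ref[4], int ref_stride,
                uint32_t sad[4]) {
  /* One pass over the source per candidate reference. */
  for (int r = 0; r < 4; r++) {
    uint32_t acc = 0;
    for (int i = 0; i < 4; i++)
      for (int j = 0; j < 16; j++)
        acc += (uint32_t)abs(src[i * src_stride + j] - ref[r][i * ref_stride + j]);
    sad[r] = acc;
  }
}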
ref_stride, uint32_t sad_array[4]); @@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u #define aom_sad_skip_16x4 aom_sad_skip_16x4_c void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); -#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c +void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]); unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp, #define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2 uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t 
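The aom_sad_skip_16x4x4d hunk above also shows the two binding styles this header uses: while only the C version exists, the kernel name is a zero-cost compile-time #define alias; as soon as a SIMD version is added, it becomes an RTCD_EXTERN pointer filled in by setup_rtcd_internal(). A tiny before/after sketch with invented names:

#include <stdint.h>

static uint32_t kernel_c(void)    { return 0; }
static uint32_t kernel_avx2(void) { return 1; }

/* before the change: single implementation, compile-time alias
 *   #define kernel kernel_c
 * after: runtime pointer, initialised to the C fallback */
static uint32_t (*kernel)(void) = kernel_c;

int main(void) {
  int has_avx2 = 0;            /* would come from CPU detection */
  if (has_avx2) kernel = kernel_avx2;
  return (int)kernel();
}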
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2

uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);

uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);

uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);

-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);

#ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
    aom_comp_mask_pred = aom_comp_mask_pred_c;
    if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
    if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+    aom_compute_correlation = aom_compute_correlation_c;
+    if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
    aom_compute_flow_at_point = aom_compute_flow_at_point_c;
    if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
-    aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+    if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+    aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+    if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+    if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+    aom_convolve8_horiz = aom_convolve8_horiz_c;
    if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
    if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
-    aom_convolve8_vert = aom_convolve8_vert_sse2;
+    aom_convolve8_vert = aom_convolve8_vert_c;
    if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
    if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
    aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
    aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
    if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+    aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+    if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
    aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
    if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
    aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
    aom_sad16x32x4d = aom_sad16x32x4d_sse2;
    if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+    aom_sad16x4x3d = aom_sad16x4x3d_c;
+    if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
    aom_sad16x4x4d = aom_sad16x4x4d_sse2;
    if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
    aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
    if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
    aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
    if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+    aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+    if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
    aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
    if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
    aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
    aom_sse = aom_sse_c;
    if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
    if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
-    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+    aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
-    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+    aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
-    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+    aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
-    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+    aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
-    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+    aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
-    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+    aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
-    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+    aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
-    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+    aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
-    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+    aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
-    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+    aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
-    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+    aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
-    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+    aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
-    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+    aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
-    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+    aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
-    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+    aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
-    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+    aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
-    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+    aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
-    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+    aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
-    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+    aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
-    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+    aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
-    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+    aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
-    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+    aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
-    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+    aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
-    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+    aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
-    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+    aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
-    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+    aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
-    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+    aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
-    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+    aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
-    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+    aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
-    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+    aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
-    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+    aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
-    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+    aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
-    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+    aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
-    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+    aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
-    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+    aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
-    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+    aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
-    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+    aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
-    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+    aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
-    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+    aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
-    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+    aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
    if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
-    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+    aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
-    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+    aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
-    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+    aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
-    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+    aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
    if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
    aom_subtract_block = aom_subtract_block_sse2;
    if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
    aom_vector_var = aom_vector_var_c;
    if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
    if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
-    av1_compute_cross_correlation = av1_compute_cross_correlation_c;
-    if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
-    if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}

#endif
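With the `setup_rtcd_internal()` hunks above, the selection logic is visible end to end: each pointer is first set to a baseline (the C fallback, or SSE2 where that is unconditionally available on x64) and then overwritten by progressively newer ISA variants, so the last test that passes leaves the fastest implementation installed. A self-contained sketch of that resolution order, with illustrative flag values and names rather than the real `aom_ports` constants:

```c
#include <stdint.h>

#define HAS_SSSE3 0x04  /* illustrative bit values, not libaom's actual ones */
#define HAS_AVX2  0x40

/* Variants assumed defined elsewhere, as in the generated header. */
uint32_t variance16x16_c(const uint8_t *src, int ss, const uint8_t *ref, int rs, uint32_t *sse);
uint32_t variance16x16_ssse3(const uint8_t *src, int ss, const uint8_t *ref, int rs, uint32_t *sse);
uint32_t variance16x16_avx2(const uint8_t *src, int ss, const uint8_t *ref, int rs, uint32_t *sse);

uint32_t (*variance16x16)(const uint8_t *, int, const uint8_t *, int, uint32_t *);

static void setup_rtcd(int flags) {
  /* Baseline first, then upgrade in increasing order of capability, so
   * the last matching assignment wins on the most capable CPUs. */
  variance16x16 = variance16x16_c;
  if (flags & HAS_SSSE3) variance16x16 = variance16x16_ssse3;
  if (flags & HAS_AVX2) variance16x16 = variance16x16_avx2;
}
```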
diff --git a/media/libaom/config/win/x64/config/aom_scale_rtcd.h b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/win/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif

+#include <stdbool.h>
+
struct yv12_buffer_config;

#ifdef __cplusplus
extern "C" {
#endif

-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c

void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c

-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c

-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c

-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c

void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c

-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c

void aom_scale_rtcd(void);
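For callers, the interesting part of this file's diff is the signature churn: the plane-copy helpers gain a `use_crop` argument and `aom_yv12_realloc_with_new_border_c` replaces its `num_pyramid_levels` count with a `bool alloc_pyramid` (hence the new `<stdbool.h>` include). A hedged sketch of the corresponding caller updates follows; the semantics assumed here, that `use_crop` selects cropped rather than padded plane dimensions and `alloc_pyramid` requests the global-motion image pyramid, are an interpretation of upstream libaom, not something this header states:

```c
#include <stdbool.h>

struct yv12_buffer_config;

void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc,
                       struct yv12_buffer_config *dst_ybc, int use_crop);
int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border,
                                       int byte_alignment, bool alloc_pyramid,
                                       int num_planes);

static void copy_and_grow(struct yv12_buffer_config *src, struct yv12_buffer_config *dst) {
  /* Previously: aom_yv12_copy_y_c(src, dst). Callers now state which size to copy. */
  aom_yv12_copy_y_c(src, dst, /*use_crop=*/0);
  /* Previously the fourth argument was an int pyramid-level count; it is now a
   * plain yes/no request. Border/alignment values here are arbitrary examples. */
  (void)aom_yv12_realloc_with_new_border_c(dst, /*new_border=*/288,
                                           /*byte_alignment=*/32,
                                           /*alloc_pyramid=*/false,
                                           /*num_planes=*/3);
}
```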
diff --git a/media/libaom/config/win/x64/config/av1_rtcd.h b/media/libaom/config/win/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/win/x64/config/av1_rtcd.h
+++ b/media/libaom/config/win/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c

void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);

void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);

void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);

void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);

void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);

void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int
block_width, int block_height); RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift); -int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); -int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift); int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift); RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift); void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); -void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); -void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2); @@ -941,7 +915,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2; av1_convolve_y_sr = av1_convolve_y_sr_sse2; if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2; - av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2; + av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c; if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3; if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2; av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2; @@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void) av1_inv_txfm_add = av1_inv_txfm_add_c; if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3; if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2; - av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2; + av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c; if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1; if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2; av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c; @@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2; av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2; if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2; - cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2; - if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3; + cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c; if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2; 
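/* [Editor's sketch, not part of the vendored diff.] The hunk above is
 * libaom's run-time CPU detection (RTCD) table: each kernel is a function
 * pointer initialised to the portable C fallback and then promoted to the
 * best SIMD variant the detected CPU flags allow, so dropping the SSE2/SSSE3
 * objects simply makes the C version the new baseline. A minimal,
 * self-contained rendering of the pattern, with hypothetical kernel names
 * and flag bit values (HAS_SSE4_1/HAS_AVX2 mirror the flags used above): */

#include <stdint.h>

#define HAS_SSE4_1 (1 << 0) /* hypothetical bit values for this sketch */
#define HAS_AVX2 (1 << 1)

static int find_dir_c(const uint16_t *img, int stride, int32_t *var,
                      int coeff_shift) { return 0; /* portable reference */ }
static int find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var,
                           int coeff_shift) { return 0; /* SIMD stub */ }
static int find_dir_avx2(const uint16_t *img, int stride, int32_t *var,
                         int coeff_shift) { return 0; /* SIMD stub */ }

/* the dispatch pointer that every caller goes through */
static int (*find_dir)(const uint16_t *, int, int32_t *, int);

static void setup_rtcd(int flags) {
  find_dir = find_dir_c;                              /* safe baseline */
  if (flags & HAS_SSE4_1) find_dir = find_dir_sse4_1; /* promote if present */
  if (flags & HAS_AVX2) find_dir = find_dir_avx2;     /* best variant last */
}
/* After setup_rtcd() runs once at startup, callers simply invoke
 * find_dir(img, stride, &var, shift) and get the fastest supported kernel. */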
- cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2; - if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3; + cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c; if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1; if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2; - cdef_filter_16_0 = cdef_filter_16_0_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3; + cdef_filter_16_0 = cdef_filter_16_0_c; if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2; - cdef_filter_16_1 = cdef_filter_16_1_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3; + cdef_filter_16_1 = cdef_filter_16_1_c; if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2; - cdef_filter_16_2 = cdef_filter_16_2_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3; + cdef_filter_16_2 = cdef_filter_16_2_c; if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2; - cdef_filter_16_3 = cdef_filter_16_3_sse2; - if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3; + cdef_filter_16_3 = cdef_filter_16_3_c; if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2; - cdef_filter_8_0 = cdef_filter_8_0_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3; + cdef_filter_8_0 = cdef_filter_8_0_c; if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2; - cdef_filter_8_1 = cdef_filter_8_1_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3; + cdef_filter_8_1 = cdef_filter_8_1_c; if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2; - cdef_filter_8_2 = cdef_filter_8_2_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3; + cdef_filter_8_2 = cdef_filter_8_2_c; if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2; - cdef_filter_8_3 = cdef_filter_8_3_sse2; - if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3; + cdef_filter_8_3 = cdef_filter_8_3_c; if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1; if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2; - cdef_find_dir = cdef_find_dir_sse2; - if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3; + cdef_find_dir = cdef_find_dir_c; if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1; if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2; - cdef_find_dir_dual = cdef_find_dir_dual_sse2; - if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3; + cdef_find_dir_dual = cdef_find_dir_dual_c; if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1; if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2; cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c; diff --git a/media/libaom/moz.yaml b/media/libaom/moz.yaml index b06ee5115a..a37ab1e904 100644 --- a/media/libaom/moz.yaml +++ b/media/libaom/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag 
SSS", "bookmark SSS" - release: 11631186b36e96afce18808ebebb17cc23a010ef (Fri Jan 19 23:29:34 2024 +0000). + release: 879d14159441796c92f3bbba7f8965e1bcf320ca (Tue Apr 02 21:57:54 2024 +0000). # Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 11631186b36e96afce18808ebebb17cc23a010ef + revision: 879d14159441796c92f3bbba7f8965e1bcf320ca # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ @@ -54,3 +54,7 @@ vendoring: - action: run-script script: '{yaml_dir}/generate_sources_mozbuild.sh' cwd: '{yaml_dir}' + + patches: + - 0001-errno.patch + - 0002-mmloadusi64.patch diff --git a/media/libaom/sources.mozbuild b/media/libaom/sources.mozbuild index b29ddd5c97..187bf97f8a 100644 --- a/media/libaom/sources.mozbuild +++ b/media/libaom/sources.mozbuild @@ -506,6 +506,7 @@ files = { '../../third_party/aom/aom_dsp/flow_estimation/ransac.c', '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c', '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c', + '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c', '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c', '../../third_party/aom/aom_dsp/fwd_txfm.c', '../../third_party/aom/aom_dsp/grain_table.c', @@ -533,11 +534,8 @@ files = { '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm', '../../third_party/aom/aom_dsp/x86/aom_quantize_avx.c', '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c', - '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c', '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c', - '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm', '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm', - '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm', '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm', '../../third_party/aom/aom_dsp/x86/avg_intrin_avx2.c', '../../third_party/aom/aom_dsp/x86/avg_intrin_sse2.c', @@ -599,7 +597,7 @@ files = { '../../third_party/aom/aom_dsp/x86/sad_sse2.asm', '../../third_party/aom/aom_dsp/x86/sse_avx2.c', '../../third_party/aom/aom_dsp/x86/sse_sse4.c', - '../../third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm', + '../../third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm', '../../third_party/aom/aom_dsp/x86/subtract_avx2.c', '../../third_party/aom/aom_dsp/x86/subtract_sse2.asm', '../../third_party/aom/aom_dsp/x86/sum_squares_avx2.c', @@ -658,7 +656,6 @@ files = { '../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c', '../../third_party/aom/av1/common/x86/av1_txfm_sse4.c', '../../third_party/aom/av1/common/x86/cdef_block_avx2.c', - '../../third_party/aom/av1/common/x86/cdef_block_sse2.c', '../../third_party/aom/av1/common/x86/cdef_block_sse4.c', '../../third_party/aom/av1/common/x86/cdef_block_ssse3.c', '../../third_party/aom/av1/common/x86/cfl_avx2.c', @@ -859,6 +856,7 @@ files = { '../../third_party/aom/aom_dsp/flow_estimation/ransac.c', '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c', '../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c', + '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c', '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c', '../../third_party/aom/aom_dsp/fwd_txfm.c', '../../third_party/aom/aom_dsp/grain_table.c', @@ -886,11 +884,8 @@ files = { '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm', 
'../../third_party/aom/aom_dsp/x86/aom_quantize_avx.c', '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c', - '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c', '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c', - '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm', '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm', - '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm', '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm', '../../third_party/aom/aom_dsp/x86/avg_intrin_avx2.c', '../../third_party/aom/aom_dsp/x86/avg_intrin_sse2.c', @@ -955,7 +950,7 @@ files = { '../../third_party/aom/aom_dsp/x86/sse_avx2.c', '../../third_party/aom/aom_dsp/x86/sse_sse4.c', '../../third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm', - '../../third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm', + '../../third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm', '../../third_party/aom/aom_dsp/x86/subtract_avx2.c', '../../third_party/aom/aom_dsp/x86/subtract_sse2.asm', '../../third_party/aom/aom_dsp/x86/sum_squares_avx2.c', @@ -1014,9 +1009,7 @@ files = { '../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c', '../../third_party/aom/av1/common/x86/av1_txfm_sse4.c', '../../third_party/aom/av1/common/x86/cdef_block_avx2.c', - '../../third_party/aom/av1/common/x86/cdef_block_sse2.c', '../../third_party/aom/av1/common/x86/cdef_block_sse4.c', - '../../third_party/aom/av1/common/x86/cdef_block_ssse3.c', '../../third_party/aom/av1/common/x86/cfl_avx2.c', '../../third_party/aom/av1/common/x86/cfl_sse2.c', '../../third_party/aom/av1/common/x86/cfl_ssse3.c', diff --git a/media/libcubeb/0004-audiounit-ios-compile-fixes.patch b/media/libcubeb/0004-audiounit-ios-compile-fixes.patch new file mode 100644 index 0000000000..465ae0f98a --- /dev/null +++ b/media/libcubeb/0004-audiounit-ios-compile-fixes.patch @@ -0,0 +1,1415 @@ +diff --git a/src/cubeb_audiounit.cpp b/src/cubeb_audiounit.cpp +--- a/src/cubeb_audiounit.cpp ++++ b/src/cubeb_audiounit.cpp +@@ -36,16 +36,25 @@ + #include <vector> + + using namespace std; + + #if MAC_OS_X_VERSION_MIN_REQUIRED < 101000 + typedef UInt32 AudioFormatFlags; + #endif + ++#if TARGET_OS_IPHONE ++typedef UInt32 AudioDeviceID; ++typedef UInt32 AudioObjectID; ++const UInt32 kAudioObjectUnknown = 0; ++ ++#define AudioGetCurrentHostTime mach_absolute_time ++ ++#endif ++ + #define AU_OUT_BUS 0 + #define AU_IN_BUS 1 + + const char * DISPATCH_QUEUE_LABEL = "org.mozilla.cubeb"; + const char * PRIVATE_AGGREGATE_DEVICE_NAME = "CubebAggregateDevice"; + + #ifdef ALOGV + #undef ALOGV +@@ -60,45 +69,47 @@ const char * PRIVATE_AGGREGATE_DEVICE_NA + #undef ALOG + #endif + #define ALOG(msg, ...) \ + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_HIGH, 0), \ + ^{ \ + LOG(msg, ##__VA_ARGS__); \ + }) + ++#if !TARGET_OS_IPHONE + /* Testing empirically, some headsets report a minimal latency that is very + * low, but this does not work in practice. Lie and say the minimum is 256 + * frames. 
*/ + const uint32_t SAFE_MIN_LATENCY_FRAMES = 128; + const uint32_t SAFE_MAX_LATENCY_FRAMES = 512; + + const AudioObjectPropertyAddress DEFAULT_INPUT_DEVICE_PROPERTY_ADDRESS = { + kAudioHardwarePropertyDefaultInputDevice, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + const AudioObjectPropertyAddress DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS = { + kAudioHardwarePropertyDefaultOutputDevice, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + const AudioObjectPropertyAddress DEVICE_IS_ALIVE_PROPERTY_ADDRESS = { + kAudioDevicePropertyDeviceIsAlive, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + const AudioObjectPropertyAddress DEVICES_PROPERTY_ADDRESS = { + kAudioHardwarePropertyDevices, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + const AudioObjectPropertyAddress INPUT_DATA_SOURCE_PROPERTY_ADDRESS = { + kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeInput, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + const AudioObjectPropertyAddress OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS = { + kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeOutput, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; ++#endif + + typedef uint32_t device_flags_value; + + enum device_flags { + DEV_UNKNOWN = 0x00, /* Unknown */ + DEV_INPUT = 0x01, /* Record device like mic */ + DEV_OUTPUT = 0x02, /* Playback device like speakers */ + DEV_SYSTEM_DEFAULT = 0x04, /* System default device */ +@@ -109,49 +120,51 @@ enum device_flags { + void + audiounit_stream_stop_internal(cubeb_stream * stm); + static int + audiounit_stream_start_internal(cubeb_stream * stm); + static void + audiounit_close_stream(cubeb_stream * stm); + static int + audiounit_setup_stream(cubeb_stream * stm); ++#if !TARGET_OS_IPHONE + static vector<AudioObjectID> + audiounit_get_devices_of_type(cubeb_device_type devtype); + static UInt32 + audiounit_get_device_presentation_latency(AudioObjectID devid, + AudioObjectPropertyScope scope); +- +-#if !TARGET_OS_IPHONE + static AudioObjectID + audiounit_get_default_device_id(cubeb_device_type type); + static int + audiounit_uninstall_device_changed_callback(cubeb_stream * stm); + static int + audiounit_uninstall_system_changed_callback(cubeb_stream * stm); ++#endif ++ + static void + audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags); +-#endif + + extern cubeb_ops const audiounit_ops; + + struct cubeb { + cubeb_ops const * ops = &audiounit_ops; + owned_critical_section mutex; + int active_streams = 0; + uint32_t global_latency_frames = 0; + cubeb_device_collection_changed_callback input_collection_changed_callback = + nullptr; + void * input_collection_changed_user_ptr = nullptr; + cubeb_device_collection_changed_callback output_collection_changed_callback = + nullptr; + void * output_collection_changed_user_ptr = nullptr; ++ #if !TARGET_OS_IPHONE + // Store list of devices to detect changes + vector<AudioObjectID> input_device_array; + vector<AudioObjectID> output_device_array; ++ #endif + // The queue should be released when it’s no longer needed. 
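/* [Editor's note, not part of the vendored patch.] The matching
 * dispatch_release(ctx->serial_queue) for the dispatch_queue_create() below
 * is performed in audiounit_destroy(), further down in this same patch. */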
+ dispatch_queue_t serial_queue = + dispatch_queue_create(DISPATCH_QUEUE_LABEL, DISPATCH_QUEUE_SERIAL); + // Current used channel layout + atomic<cubeb_channel_layout> layout{CUBEB_LAYOUT_UNDEFINED}; + uint32_t channels = 0; + }; + +@@ -181,29 +194,31 @@ to_string(io_side side) + } + } + + struct device_info { + AudioDeviceID id = kAudioObjectUnknown; + device_flags_value flags = DEV_UNKNOWN; + }; + ++#if !TARGET_OS_IPHONE + struct property_listener { + AudioDeviceID device_id; + const AudioObjectPropertyAddress * property_address; + AudioObjectPropertyListenerProc callback; + cubeb_stream * stream; + + property_listener(AudioDeviceID id, + const AudioObjectPropertyAddress * address, + AudioObjectPropertyListenerProc proc, cubeb_stream * stm) + : device_id(id), property_address(address), callback(proc), stream(stm) + { + } + }; ++#endif + + struct cubeb_stream { + explicit cubeb_stream(cubeb * context); + + /* Note: Must match cubeb_stream layout in cubeb.c. */ + cubeb * context; + void * user_ptr = nullptr; + /**/ +@@ -252,32 +267,36 @@ struct cubeb_stream { + /* Latency requested by the user. */ + uint32_t latency_frames = 0; + atomic<uint32_t> current_latency_frames{0}; + atomic<uint32_t> total_output_latency_frames{0}; + unique_ptr<cubeb_resampler, decltype(&cubeb_resampler_destroy)> resampler; + /* This is true if a device change callback is currently running. */ + atomic<bool> switching_device{false}; + atomic<bool> buffer_size_change_state{false}; ++ #if !TARGET_OS_IPHONE + AudioDeviceID aggregate_device_id = + kAudioObjectUnknown; // the aggregate device id + AudioObjectID plugin_id = + kAudioObjectUnknown; // used to create aggregate device ++ #endif + /* Mixer interface */ + unique_ptr<cubeb_mixer, decltype(&cubeb_mixer_destroy)> mixer; + /* Buffer where remixing/resampling will occur when upmixing is required */ + /* Only accessed from callback thread */ + unique_ptr<uint8_t[]> temp_buffer; + size_t temp_buffer_size = 0; // size in bytes. ++ #if !TARGET_OS_IPHONE + /* Listeners indicating what system events are monitored. */ + unique_ptr<property_listener> default_input_listener; + unique_ptr<property_listener> default_output_listener; + unique_ptr<property_listener> input_alive_listener; + unique_ptr<property_listener> input_source_listener; + unique_ptr<property_listener> output_source_listener; ++ #endif + }; + + bool + has_input(cubeb_stream * stm) + { + return stm->input_stream_params.rate != 0; + } + +@@ -381,24 +400,16 @@ bool + is_common_sample_rate(Float64 sample_rate) + { + /* Some commonly used sample rates and their multiples and divisors. 
*/ + return sample_rate == 8000 || sample_rate == 16000 || sample_rate == 22050 || + sample_rate == 32000 || sample_rate == 44100 || sample_rate == 48000 || + sample_rate == 88200 || sample_rate == 96000; + } + +-#if TARGET_OS_IPHONE +-typedef UInt32 AudioDeviceID; +-typedef UInt32 AudioObjectID; +- +-#define AudioGetCurrentHostTime mach_absolute_time +- +-#endif +- + uint64_t + ConvertHostTimeToNanos(uint64_t host_time) + { + static struct mach_timebase_info timebase_info; + static bool initialized = false; + if (!initialized) { + mach_timebase_info(&timebase_info); + initialized = true; +@@ -756,23 +767,23 @@ audiounit_init(cubeb ** context, char co + } + + static char const * + audiounit_get_backend_id(cubeb * /* ctx */) + { + return "audiounit"; + } + +-#if !TARGET_OS_IPHONE + + static int + audiounit_stream_get_volume(cubeb_stream * stm, float * volume); + static int + audiounit_stream_set_volume(cubeb_stream * stm, float volume); + ++#if !TARGET_OS_IPHONE + static int + audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side) + { + assert(stm); + + device_info * info = nullptr; + cubeb_device_type type = CUBEB_DEVICE_TYPE_UNKNOWN; + +@@ -806,42 +817,47 @@ audiounit_set_device_info(cubeb_stream * + } + + assert(info->id); + assert(info->flags & DEV_INPUT && !(info->flags & DEV_OUTPUT) || + !(info->flags & DEV_INPUT) && info->flags & DEV_OUTPUT); + + return CUBEB_OK; + } ++#endif + + static int + audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags) + { + auto_lock context_lock(stm->context->mutex); + assert((flags & DEV_INPUT && stm->input_unit) || + (flags & DEV_OUTPUT && stm->output_unit)); + if (!stm->shutdown) { + audiounit_stream_stop_internal(stm); + } + +- int r = audiounit_uninstall_device_changed_callback(stm); ++ int r; ++#if !TARGET_OS_IPHONE ++ r = audiounit_uninstall_device_changed_callback(stm); + if (r != CUBEB_OK) { + LOG("(%p) Could not uninstall all device change listeners.", stm); + } ++#endif + + { + auto_lock lock(stm->mutex); + float volume = 0.0; + int vol_rv = CUBEB_ERROR; + if (stm->output_unit) { + vol_rv = audiounit_stream_get_volume(stm, &volume); + } + + audiounit_close_stream(stm); + ++ #if !TARGET_OS_IPHONE + /* Reinit occurs in one of the following case: + * - When the device is not alive any more + * - When the default system device change. + * - The bluetooth device changed from A2DP to/from HFP/HSP profile + * We first attempt to re-use the same device id, should that fail we will + * default to the (potentially new) default device. */ + AudioDeviceID input_device = + flags & DEV_INPUT ? stm->input_device.id : kAudioObjectUnknown; +@@ -861,29 +877,33 @@ audiounit_reinit_stream(cubeb_stream * s + r = audiounit_set_device_info(stm, kAudioObjectUnknown, io_side::OUTPUT); + if (r != CUBEB_OK) { + LOG("(%p) Set output device info failed. This can happen when last media " + "device is unplugged", + stm); + return CUBEB_ERROR; + } + ++ #endif ++ + if (audiounit_setup_stream(stm) != CUBEB_OK) { + LOG("(%p) Stream reinit failed.", stm); ++ #if !TARGET_OS_IPHONE + if (flags & DEV_INPUT && input_device != kAudioObjectUnknown) { + // Attempt to re-use the same device-id failed, so attempt again with + // default input device. 
+ audiounit_close_stream(stm); + if (audiounit_set_device_info(stm, kAudioObjectUnknown, + io_side::INPUT) != CUBEB_OK || + audiounit_setup_stream(stm) != CUBEB_OK) { + LOG("(%p) Second stream reinit failed.", stm); + return CUBEB_ERROR; + } + } ++ #endif + } + + if (vol_rv == CUBEB_OK) { + audiounit_stream_set_volume(stm, volume); + } + + // If the stream was running, start it again. + if (!stm->shutdown) { +@@ -909,27 +929,30 @@ audiounit_reinit_stream_async(cubeb_stre + // Get/SetProperties method from inside notify callback + dispatch_async(stm->context->serial_queue, ^() { + if (stm->destroy_pending) { + ALOG("(%p) stream pending destroy, cancelling reinit task", stm); + return; + } + + if (audiounit_reinit_stream(stm, flags) != CUBEB_OK) { ++ #if !TARGET_OS_IPHONE + if (audiounit_uninstall_system_changed_callback(stm) != CUBEB_OK) { + LOG("(%p) Could not uninstall system changed callback", stm); + } ++ #endif + stm->state_callback(stm, stm->user_ptr, CUBEB_STATE_ERROR); + LOG("(%p) Could not reopen the stream after switching.", stm); + } + stm->switching_device = false; + stm->reinit_pending = false; + }); + } + ++#if !TARGET_OS_IPHONE + static char const * + event_addr_to_string(AudioObjectPropertySelector selector) + { + switch (selector) { + case kAudioHardwarePropertyDefaultOutputDevice: + return "kAudioHardwarePropertyDefaultOutputDevice"; + case kAudioHardwarePropertyDefaultInputDevice: + return "kAudioHardwarePropertyDefaultInputDevice"; +@@ -1091,16 +1114,17 @@ audiounit_install_device_changed_callbac + rv, stm->input_device.id); + r = CUBEB_ERROR; + } + } + + return r; + } + ++#if !TARGET_OS_IPHONE + static int + audiounit_install_system_changed_callback(cubeb_stream * stm) + { + OSStatus r; + + if (stm->output_unit) { + /* This event will notify us when the default audio device changes, + * for example when the user plugs in a USB headset and the system chooses +@@ -1131,16 +1155,17 @@ audiounit_install_system_changed_callbac + "kAudioHardwarePropertyDefaultInputDevice rv=%d", + r); + return CUBEB_ERROR; + } + } + + return CUBEB_OK; + } ++#endif + + static int + audiounit_uninstall_device_changed_callback(cubeb_stream * stm) + { + OSStatus rv; + // Failing to uninstall listeners is not a fatal error. 
+ int r = CUBEB_OK; + +@@ -1207,17 +1232,17 @@ audiounit_uninstall_system_changed_callb + static int + audiounit_get_acceptable_latency_range(AudioValueRange * latency_range) + { + UInt32 size; + OSStatus r; + AudioDeviceID output_device_id; + AudioObjectPropertyAddress output_device_buffer_size_range = { + kAudioDevicePropertyBufferFrameSizeRange, kAudioDevicePropertyScopeOutput, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT); + if (output_device_id == kAudioObjectUnknown) { + LOG("Could not get default output device id."); + return CUBEB_ERROR; + } + + /* Get the buffer size range this device supports */ +@@ -1228,17 +1253,16 @@ audiounit_get_acceptable_latency_range(A + &size, latency_range); + if (r != noErr) { + LOG("AudioObjectGetPropertyData/buffer size range rv=%d", r); + return CUBEB_ERROR; + } + + return CUBEB_OK; + } +-#endif /* !TARGET_OS_IPHONE */ + + static AudioObjectID + audiounit_get_default_device_id(cubeb_device_type type) + { + const AudioObjectPropertyAddress * adr; + if (type == CUBEB_DEVICE_TYPE_OUTPUT) { + adr = &DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS; + } else if (type == CUBEB_DEVICE_TYPE_INPUT) { +@@ -1251,31 +1275,32 @@ audiounit_get_default_device_id(cubeb_de + UInt32 size = sizeof(AudioDeviceID); + if (AudioObjectGetPropertyData(kAudioObjectSystemObject, adr, 0, NULL, &size, + &devid) != noErr) { + return kAudioObjectUnknown; + } + + return devid; + } ++#endif /* !TARGET_OS_IPHONE */ + + int + audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels) + { + #if TARGET_OS_IPHONE + // TODO: [[AVAudioSession sharedInstance] maximumOutputNumberOfChannels] + *max_channels = 2; + #else + UInt32 size; + OSStatus r; + AudioDeviceID output_device_id; + AudioStreamBasicDescription stream_format; + AudioObjectPropertyAddress stream_format_address = { + kAudioDevicePropertyStreamFormat, kAudioDevicePropertyScopeOutput, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + assert(ctx && max_channels); + + output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT); + if (output_device_id == kAudioObjectUnknown) { + return CUBEB_ERROR; + } + +@@ -1304,52 +1329,52 @@ audiounit_get_min_latency(cubeb * /* ctx + AudioValueRange latency_range; + if (audiounit_get_acceptable_latency_range(&latency_range) != CUBEB_OK) { + LOG("Could not get acceptable latency range."); + return CUBEB_ERROR; + } + + *latency_frames = + max<uint32_t>(latency_range.mMinimum, SAFE_MIN_LATENCY_FRAMES); ++ return CUBEB_OK; + #endif +- +- return CUBEB_OK; + } + + static int + audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate) + { + #if TARGET_OS_IPHONE +- // TODO +- return CUBEB_ERROR_NOT_SUPPORTED; ++ *rate = 44100; ++ return CUBEB_OK; + #else + UInt32 size; + OSStatus r; + Float64 fsamplerate; + AudioDeviceID output_device_id; + AudioObjectPropertyAddress samplerate_address = { + kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT); + if (output_device_id == kAudioObjectUnknown) { + return CUBEB_ERROR; + } + + size = sizeof(fsamplerate); + r = AudioObjectGetPropertyData(output_device_id, &samplerate_address, 0, NULL, + &size, &fsamplerate); + + if (r != noErr) { + return CUBEB_ERROR; + } + + *rate = static_cast<uint32_t>(fsamplerate); ++ ++ return 
CUBEB_OK; + #endif +- return CUBEB_OK; + } + + static cubeb_channel_layout + audiounit_convert_channel_layout(AudioChannelLayout * layout) + { + // When having one or two channel, force mono or stereo. Some devices (namely, + // Bose QC35, mark 1 and 2), expose a single channel mapped to the right for + // some reason. +@@ -1380,16 +1405,19 @@ audiounit_convert_channel_layout(AudioCh + } + + return cl; + } + + static cubeb_channel_layout + audiounit_get_preferred_channel_layout(AudioUnit output_unit) + { ++ #if TARGET_OS_IPHONE ++ return CUBEB_LAYOUT_STEREO; ++ #else + OSStatus rv = noErr; + UInt32 size = 0; + rv = AudioUnitGetPropertyInfo( + output_unit, kAudioDevicePropertyPreferredChannelLayout, + kAudioUnitScope_Output, AU_OUT_BUS, &size, nullptr); + if (rv != noErr) { + LOG("AudioUnitGetPropertyInfo/kAudioDevicePropertyPreferredChannelLayout " + "rv=%d", +@@ -1404,16 +1432,17 @@ audiounit_get_preferred_channel_layout(A + kAudioUnitScope_Output, AU_OUT_BUS, layout.get(), &size); + if (rv != noErr) { + LOG("AudioUnitGetProperty/kAudioDevicePropertyPreferredChannelLayout rv=%d", + rv); + return CUBEB_LAYOUT_UNDEFINED; + } + + return audiounit_convert_channel_layout(layout.get()); ++ #endif + } + + static cubeb_channel_layout + audiounit_get_current_channel_layout(AudioUnit output_unit) + { + OSStatus rv = noErr; + UInt32 size = 0; + rv = AudioUnitGetPropertyInfo( +@@ -1437,18 +1466,20 @@ audiounit_get_current_channel_layout(Aud + } + + return audiounit_convert_channel_layout(layout.get()); + } + + static int + audiounit_create_unit(AudioUnit * unit, device_info * device); + ++#if !TARGET_OS_IPHONE + static OSStatus + audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype); ++#endif + + static void + audiounit_destroy(cubeb * ctx) + { + { + auto_lock lock(ctx->mutex); + + // Disabling this assert for bug 1083664 -- we seem to leak a stream +@@ -1460,23 +1491,25 @@ audiounit_destroy(cubeb * ctx) + + // Destroying a cubeb context with device collection callbacks registered + // is misuse of the API, assert then attempt to clean up. + assert(!ctx->input_collection_changed_callback && + !ctx->input_collection_changed_user_ptr && + !ctx->output_collection_changed_callback && + !ctx->output_collection_changed_user_ptr); + ++ #if !TARGET_OS_IPHONE + /* Unregister the callback if necessary. 
*/ + if (ctx->input_collection_changed_callback) { + audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_INPUT); + } + if (ctx->output_collection_changed_callback) { + audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_OUTPUT); + } ++ #endif + } + + dispatch_release(ctx->serial_queue); + + delete ctx; + } + + static void +@@ -1594,23 +1627,24 @@ audiounit_layout_init(cubeb_stream * stm + } + + stm->context->layout = audiounit_get_current_channel_layout(stm->output_unit); + + audiounit_set_channel_layout(stm->output_unit, io_side::OUTPUT, + stm->context->layout); + } + ++#if !TARGET_OS_IPHONE + static vector<AudioObjectID> + audiounit_get_sub_devices(AudioDeviceID device_id) + { + vector<AudioDeviceID> sub_devices; + AudioObjectPropertyAddress property_address = { + kAudioAggregateDevicePropertyActiveSubDeviceList, +- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain}; + UInt32 size = 0; + OSStatus rv = AudioObjectGetPropertyDataSize(device_id, &property_address, 0, + nullptr, &size); + + if (rv != noErr) { + sub_devices.push_back(device_id); + return sub_devices; + } +@@ -1629,17 +1663,17 @@ audiounit_get_sub_devices(AudioDeviceID + } + + static int + audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id, + AudioDeviceID * aggregate_device_id) + { + AudioObjectPropertyAddress address_plugin_bundle_id = { + kAudioHardwarePropertyPlugInForBundleID, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + UInt32 size = 0; + OSStatus r = AudioObjectGetPropertyDataSize( + kAudioObjectSystemObject, &address_plugin_bundle_id, 0, NULL, &size); + if (r != noErr) { + LOG("AudioObjectGetPropertyDataSize/" + "kAudioHardwarePropertyPlugInForBundleID, rv=%d", + r); + return CUBEB_ERROR; +@@ -1659,17 +1693,17 @@ audiounit_create_blank_aggregate_device( + LOG("AudioObjectGetPropertyData/kAudioHardwarePropertyPlugInForBundleID, " + "rv=%d", + r); + return CUBEB_ERROR; + } + + AudioObjectPropertyAddress create_aggregate_device_address = { + kAudioPlugInCreateAggregateDevice, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + r = AudioObjectGetPropertyDataSize( + *plugin_id, &create_aggregate_device_address, 0, nullptr, &size); + if (r != noErr) { + LOG("AudioObjectGetPropertyDataSize/kAudioPlugInCreateAggregateDevice, " + "rv=%d", + r); + return CUBEB_ERROR; + } +@@ -1731,17 +1765,17 @@ audiounit_create_blank_aggregate_device( + // object is increased. + static CFStringRef + get_device_name(AudioDeviceID id) + { + UInt32 size = sizeof(CFStringRef); + CFStringRef UIname = nullptr; + AudioObjectPropertyAddress address_uuid = {kAudioDevicePropertyDeviceUID, + kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + OSStatus err = + AudioObjectGetPropertyData(id, &address_uuid, 0, nullptr, &size, &UIname); + return (err == noErr) ? 
UIname : NULL; + } + + static int + audiounit_set_aggregate_sub_device_list(AudioDeviceID aggregate_device_id, + AudioDeviceID input_device_id, +@@ -1774,17 +1808,17 @@ audiounit_set_aggregate_sub_device_list( + return CUBEB_ERROR; + } + CFArrayAppendValue(aggregate_sub_devices_array, ref); + CFRelease(ref); + } + + AudioObjectPropertyAddress aggregate_sub_device_list = { + kAudioAggregateDevicePropertyFullSubDeviceList, +- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain}; + UInt32 size = sizeof(CFMutableArrayRef); + OSStatus rv = AudioObjectSetPropertyData( + aggregate_device_id, &aggregate_sub_device_list, 0, nullptr, size, + &aggregate_sub_devices_array); + CFRelease(aggregate_sub_devices_array); + if (rv != noErr) { + LOG("AudioObjectSetPropertyData/" + "kAudioAggregateDevicePropertyFullSubDeviceList, rv=%d", +@@ -1796,17 +1830,17 @@ audiounit_set_aggregate_sub_device_list( + } + + static int + audiounit_set_master_aggregate_device(const AudioDeviceID aggregate_device_id) + { + assert(aggregate_device_id != kAudioObjectUnknown); + AudioObjectPropertyAddress master_aggregate_sub_device = { + kAudioAggregateDevicePropertyMasterSubDevice, +- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain}; + + // Master become the 1st output sub device + AudioDeviceID output_device_id = + audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT); + const vector<AudioDeviceID> output_sub_devices = + audiounit_get_sub_devices(output_device_id); + CFStringRef master_sub_device = get_device_name(output_sub_devices[0]); + +@@ -1829,17 +1863,17 @@ audiounit_set_master_aggregate_device(co + + static int + audiounit_activate_clock_drift_compensation( + const AudioDeviceID aggregate_device_id) + { + assert(aggregate_device_id != kAudioObjectUnknown); + AudioObjectPropertyAddress address_owned = { + kAudioObjectPropertyOwnedObjects, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + UInt32 qualifier_data_size = sizeof(AudioObjectID); + AudioClassID class_id = kAudioSubDeviceClassID; + void * qualifier_data = &class_id; + UInt32 size = 0; + OSStatus rv = AudioObjectGetPropertyDataSize( + aggregate_device_id, &address_owned, qualifier_data_size, qualifier_data, + &size); +@@ -1861,17 +1895,17 @@ audiounit_activate_clock_drift_compensat + if (rv != noErr) { + LOG("AudioObjectGetPropertyData/kAudioObjectPropertyOwnedObjects, rv=%d", + rv); + return CUBEB_ERROR; + } + + AudioObjectPropertyAddress address_drift = { + kAudioSubDevicePropertyDriftCompensation, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + // Start from the second device since the first is the master clock + for (UInt32 i = 1; i < subdevices_num; ++i) { + UInt32 drift_compensation_value = 1; + rv = AudioObjectSetPropertyData(sub_devices[i], &address_drift, 0, nullptr, + sizeof(UInt32), &drift_compensation_value); + if (rv != noErr) { + LOG("AudioObjectSetPropertyData/" +@@ -1930,17 +1964,17 @@ audiounit_workaround_for_airpod(cubeb_st + &output_min_rate, &output_max_rate, &output_nominal_rate); + LOG("(%p) Output device %u, name: %s, min: %u, max: %u, nominal rate: %u", + stm, stm->output_device.id, output_device_info.friendly_name, + output_min_rate, output_max_rate, output_nominal_rate); + + Float64 rate = input_nominal_rate; + AudioObjectPropertyAddress 
addr = {kAudioDevicePropertyNominalSampleRate, + kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + OSStatus rv = AudioObjectSetPropertyData(stm->aggregate_device_id, &addr, 0, + nullptr, sizeof(Float64), &rate); + if (rv != noErr) { + LOG("Non fatal error, " + "AudioObjectSetPropertyData/kAudioDevicePropertyNominalSampleRate, " + "rv=%d", + rv); +@@ -2014,17 +2048,17 @@ audiounit_create_aggregate_device(cubeb_ + static int + audiounit_destroy_aggregate_device(AudioObjectID plugin_id, + AudioDeviceID * aggregate_device_id) + { + assert(aggregate_device_id && *aggregate_device_id != kAudioDeviceUnknown && + plugin_id != kAudioObjectUnknown); + AudioObjectPropertyAddress destroy_aggregate_device_addr = { + kAudioPlugInDestroyAggregateDevice, kAudioObjectPropertyScopeGlobal, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + UInt32 size; + OSStatus rv = AudioObjectGetPropertyDataSize( + plugin_id, &destroy_aggregate_device_addr, 0, NULL, &size); + if (rv != noErr) { + LOG("AudioObjectGetPropertyDataSize/kAudioPlugInDestroyAggregateDevice, " + "rv=%d", + rv); + return CUBEB_ERROR; +@@ -2037,16 +2071,17 @@ audiounit_destroy_aggregate_device(Audio + rv); + return CUBEB_ERROR; + } + + LOG("Destroyed aggregate device %d", *aggregate_device_id); + *aggregate_device_id = kAudioObjectUnknown; + return CUBEB_OK; + } ++#endif + + static int + audiounit_new_unit_instance(AudioUnit * unit, device_info * device) + { + AudioComponentDescription desc; + AudioComponent comp; + OSStatus rv; + +@@ -2173,16 +2208,19 @@ audiounit_init_input_linear_buffer(cubeb + assert(stream->input_linear_buffer->length() == 0); + + return CUBEB_OK; + } + + static uint32_t + audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames) + { ++ #if TARGET_OS_IPHONE ++ return latency_frames; ++ #else + // For the 1st stream set anything within safe min-max + assert(audiounit_active_streams(stm->context) > 0); + if (audiounit_active_streams(stm->context) == 1) { + return max(min<uint32_t>(latency_frames, SAFE_MAX_LATENCY_FRAMES), + SAFE_MIN_LATENCY_FRAMES); + } + assert(stm->output_unit); + +@@ -2233,18 +2271,20 @@ audiounit_clamp_latency(cubeb_stream * s + } else if (output_buffer_size != 0) { + upper_latency_limit = output_buffer_size; + } else { + upper_latency_limit = SAFE_MAX_LATENCY_FRAMES; + } + + return max(min<uint32_t>(latency_frames, upper_latency_limit), + SAFE_MIN_LATENCY_FRAMES); ++ #endif + } + ++#if !TARGET_OS_IPHONE + /* + * Change buffer size is prone to deadlock thus we change it + * following the steps: + * - register a listener for the buffer size property + * - change the property + * - wait until the listener is executed + * - property has changed, remove the listener + * */ +@@ -2285,21 +2325,25 @@ buffer_size_changed_callback(void * inCl + "= %d for scope %d", + stm, au_type, new_buffer_size, inScope); + } + stm->buffer_size_change_state = true; + break; + } + } + } ++#endif + + static int + audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames, + io_side side) + { ++ #if TARGET_OS_IPHONE ++ return CUBEB_OK; ++ #else + AudioUnit au = stm->output_unit; + AudioUnitScope au_scope = kAudioUnitScope_Input; + AudioUnitElement au_element = AU_OUT_BUS; + + if (side == io_side::INPUT) { + au = stm->input_unit; + au_scope = kAudioUnitScope_Output; + au_element = AU_IN_BUS; +@@ -2377,16 +2421,17 @@ audiounit_set_buffer_size(cubeb_stream * + if (!stm->buffer_size_change_state && count >= 30) { + LOG("(%p) 
Error, did not get buffer size change callback ...", stm); + return CUBEB_ERROR; + } + + LOG("(%p) %s buffer size changed to %u frames.", stm, to_string(side), + new_size_frames); + return CUBEB_OK; ++ #endif + } + + static int + audiounit_configure_input(cubeb_stream * stm) + { + assert(stm && stm->input_unit); + + int r = 0; +@@ -2593,16 +2638,17 @@ audiounit_setup_stream(cubeb_stream * st + return CUBEB_ERROR_NOT_SUPPORTED; + } + + int r = 0; + + device_info in_dev_info = stm->input_device; + device_info out_dev_info = stm->output_device; + ++ #if !TARGET_OS_IPHONE + if (has_input(stm) && has_output(stm) && + stm->input_device.id != stm->output_device.id) { + r = audiounit_create_aggregate_device(stm); + if (r != CUBEB_OK) { + stm->aggregate_device_id = kAudioObjectUnknown; + LOG("(%p) Create aggregate devices failed.", stm); + // !!!NOTE: It is not necessary to return here. If it does not + // return it will fallback to the old implementation. The intention +@@ -2610,16 +2656,20 @@ audiounit_setup_stream(cubeb_stream * st + // it after a couple of weeks. + return r; + } else { + in_dev_info.id = out_dev_info.id = stm->aggregate_device_id; + in_dev_info.flags = DEV_INPUT; + out_dev_info.flags = DEV_OUTPUT; + } + } ++ #else ++ in_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_INPUT; ++ out_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_OUTPUT; ++ #endif + + if (has_input(stm)) { + r = audiounit_create_unit(&stm->input_unit, &in_dev_info); + if (r != CUBEB_OK) { + LOG("(%p) AudioUnit creation for input failed.", stm); + return r; + } + } +@@ -2751,18 +2801,20 @@ audiounit_setup_stream(cubeb_stream * st + + if (stm->output_unit != NULL) { + r = AudioUnitInitialize(stm->output_unit); + if (r != noErr) { + LOG("AudioUnitInitialize/output rv=%d", r); + return CUBEB_ERROR; + } + ++ #if !TARGET_OS_IPHONE + stm->current_latency_frames = audiounit_get_device_presentation_latency( + stm->output_device.id, kAudioDevicePropertyScopeOutput); ++ #endif + + Float64 unit_s; + UInt32 size = sizeof(unit_s); + if (AudioUnitGetProperty(stm->output_unit, kAudioUnitProperty_Latency, + kAudioUnitScope_Global, 0, &unit_s, + &size) == noErr) { + stm->current_latency_frames += + static_cast<uint32_t>(unit_s * stm->output_desc.mSampleRate); +@@ -2772,20 +2824,22 @@ audiounit_setup_stream(cubeb_stream * st + if (stm->input_unit && stm->output_unit) { + // According to the I/O hardware rate it is expected a specific pattern of + // callbacks for example is input is 44100 and output is 48000 we expected + // no more than 2 out callback in a row. 
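/* [Editor's note, not part of the vendored patch.] Worked example for the
 * assignment below: with input_hw_rate = 44100 and output_hw_rate = 48000,
 * ceilf(48000.0f / 44100.0f) == ceilf(1.0884f) == 2.0f, i.e. at most two
 * output callbacks are expected back-to-back between input callbacks. */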
+ stm->expected_output_callbacks_in_a_row = + ceilf(stm->output_hw_rate / stm->input_hw_rate); + } + ++ #if !TARGET_OS_IPHONE + r = audiounit_install_device_changed_callback(stm); + if (r != CUBEB_OK) { + LOG("(%p) Could not install all device change callback.", stm); + } ++ #endif + + return CUBEB_OK; + } + + cubeb_stream::cubeb_stream(cubeb * context) + : context(context), resampler(nullptr, cubeb_resampler_destroy), + mixer(nullptr, cubeb_mixer_destroy) + { +@@ -2823,51 +2877,57 @@ audiounit_stream_init(cubeb * context, c + stm->latency_frames = latency_frames; + + if ((input_device && !input_stream_params) || + (output_device && !output_stream_params)) { + return CUBEB_ERROR_INVALID_PARAMETER; + } + if (input_stream_params) { + stm->input_stream_params = *input_stream_params; ++ #if !TARGET_OS_IPHONE + r = audiounit_set_device_info( + stm.get(), reinterpret_cast<uintptr_t>(input_device), io_side::INPUT); + if (r != CUBEB_OK) { + LOG("(%p) Fail to set device info for input.", stm.get()); + return r; + } ++ #endif + } + if (output_stream_params) { + stm->output_stream_params = *output_stream_params; ++ #if !TARGET_OS_IPHONE + r = audiounit_set_device_info( + stm.get(), reinterpret_cast<uintptr_t>(output_device), io_side::OUTPUT); + if (r != CUBEB_OK) { + LOG("(%p) Fail to set device info for output.", stm.get()); + return r; + } ++ #endif + } + + { + // It's not critical to lock here, because no other thread has been started + // yet, but it allows to assert that the lock has been taken in + // `audiounit_setup_stream`. + auto_lock lock(stm->mutex); + r = audiounit_setup_stream(stm.get()); + } + + if (r != CUBEB_OK) { + LOG("(%p) Could not setup the audiounit stream.", stm.get()); + return r; + } + ++ #if !TARGET_OS_IPHONE + r = audiounit_install_system_changed_callback(stm.get()); + if (r != CUBEB_OK) { + LOG("(%p) Could not install the device change callback.", stm.get()); + return r; + } ++ #endif + + *stream = stm.release(); + LOG("(%p) Cubeb stream init successful.", *stream); + return CUBEB_OK; + } + + static void + audiounit_close_stream(cubeb_stream * stm) +@@ -2886,54 +2946,60 @@ audiounit_close_stream(cubeb_stream * st + AudioUnitUninitialize(stm->output_unit); + AudioComponentInstanceDispose(stm->output_unit); + stm->output_unit = nullptr; + } + + stm->resampler.reset(); + stm->mixer.reset(); + ++ #if !TARGET_OS_IPHONE + if (stm->aggregate_device_id != kAudioObjectUnknown) { + audiounit_destroy_aggregate_device(stm->plugin_id, + &stm->aggregate_device_id); + stm->aggregate_device_id = kAudioObjectUnknown; + } ++ #endif + } + + static void + audiounit_stream_destroy_internal(cubeb_stream * stm) + { + stm->context->mutex.assert_current_thread_owns(); + ++#if !TARGET_OS_IPHONE + int r = audiounit_uninstall_system_changed_callback(stm); + if (r != CUBEB_OK) { + LOG("(%p) Could not uninstall the device changed callback", stm); + } + r = audiounit_uninstall_device_changed_callback(stm); + if (r != CUBEB_OK) { + LOG("(%p) Could not uninstall all device change listeners", stm); + } ++#endif + + auto_lock lock(stm->mutex); + audiounit_close_stream(stm); + assert(audiounit_active_streams(stm->context) >= 1); + audiounit_decrement_active_streams(stm->context); + } + + static void + audiounit_stream_destroy(cubeb_stream * stm) + { ++ #if !TARGET_OS_IPHONE + int r = audiounit_uninstall_system_changed_callback(stm); + if (r != CUBEB_OK) { + LOG("(%p) Could not uninstall the device changed callback", stm); + } + r = audiounit_uninstall_device_changed_callback(stm); + if (r != CUBEB_OK) { + 
LOG("(%p) Could not uninstall all device change listeners", stm); + } ++ #endif + + if (!stm->shutdown.load()) { + auto_lock context_lock(stm->context->mutex); + audiounit_stream_stop_internal(stm); + stm->shutdown = true; + } + + stm->destroy_pending = true; +@@ -3081,16 +3147,17 @@ convert_uint32_into_string(UInt32 data) + // Reverse 0xWXYZ into 0xZYXW. + str[0] = (char)(data >> 24); + str[1] = (char)(data >> 16); + str[2] = (char)(data >> 8); + str[3] = (char)(data); + return str; + } + ++#if !TARGET_OS_IPHONE + int + audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data) + { + AudioDeviceID id = audiounit_get_default_device_id(type); + if (id == kAudioObjectUnknown) { + return CUBEB_ERROR; + } + +@@ -3102,38 +3169,43 @@ audiounit_get_default_device_datasource( + : &OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS, + 0, NULL, &size, data); + if (r != noErr) { + *data = 0; + } + + return CUBEB_OK; + } ++#endif + + int + audiounit_get_default_device_name(cubeb_stream * stm, + cubeb_device * const device, + cubeb_device_type type) + { ++#if TARGET_OS_IPHONE ++ return CUBEB_ERROR_NOT_SUPPORTED; ++#else + assert(stm); + assert(device); + + UInt32 data; + int r = audiounit_get_default_device_datasource(type, &data); + if (r != CUBEB_OK) { + return r; + } + char ** name = type == CUBEB_DEVICE_TYPE_INPUT ? &device->input_name + : &device->output_name; + *name = convert_uint32_into_string(data).release(); + if (!strlen(*name)) { // empty string. + LOG("(%p) name of %s device is empty!", stm, + type == CUBEB_DEVICE_TYPE_INPUT ? "input" : "output"); + } + return CUBEB_OK; ++ #endif + } + + int + audiounit_stream_get_current_device(cubeb_stream * stm, + cubeb_device ** const device) + { + #if TARGET_OS_IPHONE + // TODO +@@ -3178,16 +3250,17 @@ audiounit_stream_register_device_changed + auto_lock dev_cb_lock(stream->device_changed_callback_lock); + /* Note: second register without unregister first causes 'nope' error. + * Current implementation requires unregister before register a new cb. 
*/ + assert(!device_changed_callback || !stream->device_changed_callback); + stream->device_changed_callback = device_changed_callback; + return CUBEB_OK; + } + ++#if !TARGET_OS_IPHONE + static char * + audiounit_strref_to_cstr_utf8(CFStringRef strref) + { + CFIndex len, size; + char * ret; + if (strref == NULL) { + return NULL; + } +@@ -3199,22 +3272,24 @@ audiounit_strref_to_cstr_utf8(CFStringRe + + if (!CFStringGetCString(strref, ret, size, kCFStringEncodingUTF8)) { + delete[] ret; + ret = NULL; + } + + return ret; + } +- ++#endif ++ ++#if !TARGET_OS_IPHONE + static uint32_t + audiounit_get_channel_count(AudioObjectID devid, AudioObjectPropertyScope scope) + { + AudioObjectPropertyAddress adr = {0, scope, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + UInt32 size = 0; + uint32_t i, ret = 0; + + adr.mSelector = kAudioDevicePropertyStreamConfiguration; + + if (AudioObjectGetPropertyDataSize(devid, &adr, 0, NULL, &size) == noErr && + size > 0) { + AudioBufferList * list = static_cast<AudioBufferList *>(alloca(size)); +@@ -3230,17 +3305,17 @@ audiounit_get_channel_count(AudioObjectI + + static void + audiounit_get_available_samplerate(AudioObjectID devid, + AudioObjectPropertyScope scope, + uint32_t * min, uint32_t * max, + uint32_t * def) + { + AudioObjectPropertyAddress adr = {0, scope, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + + adr.mSelector = kAudioDevicePropertyNominalSampleRate; + if (AudioObjectHasProperty(devid, &adr)) { + UInt32 size = sizeof(Float64); + Float64 fvalue = 0.0; + if (AudioObjectGetPropertyData(devid, &adr, 0, NULL, &size, &fvalue) == + noErr) { + *def = fvalue; +@@ -3272,17 +3347,17 @@ audiounit_get_available_samplerate(Audio + } + } + + static UInt32 + audiounit_get_device_presentation_latency(AudioObjectID devid, + AudioObjectPropertyScope scope) + { + AudioObjectPropertyAddress adr = {0, scope, +- kAudioObjectPropertyElementMaster}; ++ kAudioObjectPropertyElementMain}; + UInt32 size, dev, stream = 0; + AudioStreamID sid[1]; + + adr.mSelector = kAudioDevicePropertyLatency; + size = sizeof(UInt32); + if (AudioObjectGetPropertyData(devid, &adr, 0, NULL, &size, &dev) != noErr) { + dev = 0; + } +@@ -3297,28 +3372,32 @@ audiounit_get_device_presentation_latenc + + return dev + stream; + } + + static int + audiounit_create_device_from_hwdev(cubeb_device_info * dev_info, + AudioObjectID devid, cubeb_device_type type) + { +- AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMaster}; ++ AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMain}; + UInt32 size; + + if (type == CUBEB_DEVICE_TYPE_OUTPUT) { + adr.mScope = kAudioDevicePropertyScopeOutput; + } else if (type == CUBEB_DEVICE_TYPE_INPUT) { + adr.mScope = kAudioDevicePropertyScopeInput; + } else { + return CUBEB_ERROR; + } + ++ #if TARGET_OS_IPHONE ++ UINT32 ch = 2; ++ #else + UInt32 ch = audiounit_get_channel_count(devid, adr.mScope); ++ #endif + if (ch == 0) { + return CUBEB_ERROR; + } + + PodZero(dev_info, 1); + + CFStringRef device_id_str = nullptr; + size = sizeof(CFStringRef); +@@ -3412,17 +3491,26 @@ audiounit_create_device_from_hwdev(cubeb + + bool + is_aggregate_device(cubeb_device_info * device_info) + { + assert(device_info->friendly_name); + return !strncmp(device_info->friendly_name, PRIVATE_AGGREGATE_DEVICE_NAME, + strlen(PRIVATE_AGGREGATE_DEVICE_NAME)); + } +- ++#endif ++ ++#if TARGET_OS_IPHONE ++static int ++audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type, ++ 
cubeb_device_collection * collection)
+{
+ return CUBEB_ERROR_NOT_SUPPORTED;
+}
+#else
+ static int
+ audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
+ cubeb_device_collection * collection)
+ {
+ vector<AudioObjectID> input_devs;
+ vector<AudioObjectID> output_devs;
+
+ // Count number of input and output devices. This is not
+@@ -3478,29 +3566,35 @@ audiounit_enumerate_devices(cubeb * /* c
+
+ static void
+ audiounit_device_destroy(cubeb_device_info * device)
+ {
+ delete[] device->device_id;
+ delete[] device->friendly_name;
+ delete[] device->vendor_name;
+ }
++#endif
+
+ static int
+ audiounit_device_collection_destroy(cubeb * /* context */,
+ cubeb_device_collection * collection)
+ {
++ #if TARGET_OS_IPHONE
++ return CUBEB_ERROR_NOT_SUPPORTED;
++ #else
+ for (size_t i = 0; i < collection->count; i++) {
+ audiounit_device_destroy(&collection->device[i]);
+ }
+ delete[] collection->device;
+
+ return CUBEB_OK;
++ #endif
+ }
+
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_devices_of_type(cubeb_device_type devtype)
+ {
+ UInt32 size = 0;
+ OSStatus ret = AudioObjectGetPropertyDataSize(
+ kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS, 0, NULL, &size);
+ if (ret != noErr) {
+ return vector<AudioObjectID>();
+@@ -3653,17 +3747,28 @@ audiounit_remove_device_listener(cubeb *
+ context->output_collection_changed_callback) {
+ return noErr;
+ }
+ /* Note: unregister a non registered cb is not a problem, not checking. */
+ return AudioObjectRemovePropertyListener(
+ kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS,
+ audiounit_collection_changed_callback, context);
+ }
+-
++#endif
++
++#if TARGET_OS_IPHONE
++int
++audiounit_register_device_collection_changed(
++ cubeb * context, cubeb_device_type devtype,
++ cubeb_device_collection_changed_callback collection_changed_callback,
++ void * user_ptr)
++{
++ return CUBEB_ERROR_NOT_SUPPORTED;
++}
++#else
+ int
+ audiounit_register_device_collection_changed(
+ cubeb * context, cubeb_device_type devtype,
+ cubeb_device_collection_changed_callback collection_changed_callback,
+ void * user_ptr)
+ {
+ if (devtype == CUBEB_DEVICE_TYPE_UNKNOWN) {
+ return CUBEB_ERROR_INVALID_PARAMETER;
+@@ -3673,16 +3778,17 @@ audiounit_register_device_collection_cha
+ if (collection_changed_callback) {
+ ret = audiounit_add_device_listener(context, devtype,
+ collection_changed_callback, user_ptr);
+ } else {
+ ret = audiounit_remove_device_listener(context, devtype);
+ }
+ return (ret == noErr) ? CUBEB_OK : CUBEB_ERROR;
+ }
++#endif
+
+ cubeb_ops const audiounit_ops = {
+ /*.init =*/audiounit_init,
+ /*.get_backend_id =*/audiounit_get_backend_id,
+ /*.get_max_channel_count =*/audiounit_get_max_channel_count,
+ /*.get_min_latency =*/audiounit_get_min_latency,
+ /*.get_preferred_sample_rate =*/audiounit_get_preferred_sample_rate,
+ /*.get_supported_input_processing_params =*/NULL,
diff --git a/media/libcubeb/0005-aaudio-timing-fix.patch b/media/libcubeb/0005-aaudio-timing-fix.patch
new file mode 100644
index 0000000000..aabaec9c50
--- /dev/null
+++ b/media/libcubeb/0005-aaudio-timing-fix.patch
@@ -0,0 +1,57 @@
+From 19fcbefe1a9c5e22f8111af251df27b41658bc77 Mon Sep 17 00:00:00 2001
+From: John Lin <jolin@mozilla.com>
+Date: Mon, 29 Apr 2024 13:46:57 -0700
+Subject: [PATCH] Invalidate timing info buffers when destroying AAudio stream.
+
+aaudio_stream_get_position() returns incorrect result because
+aaudio_stream_init() recycled destroyed stream where the
+timing_info buffers contain stale data.
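[Editor's note.] To make the 0005 fix above concrete: cubeb's triple_buffer<T> is a single-producer, single-consumer lock-free triple buffer, and a recycled stream object would otherwise still report updated() for the previous stream's data. A minimal usage sketch in C++, assuming the write()/read() accessors of upstream cubeb_triple_buffer.h in addition to the updated()/invalidate() methods visible in the patch; the payload type and function names here are hypothetical:

#include "cubeb_triple_buffer.h" // vendored header patched above
#include <cstdint>

struct timing_sample {      // hypothetical stand-in for the AAudio
  int64_t timestamp_ns;     // timing payload
  int64_t position_frames;
};

static triple_buffer<timing_sample> timing; // owned by the stream object

void audio_thread_cb(int64_t now_ns, int64_t frames) {
  timing_sample s = {now_ns, frames};
  timing.write(s);          // publish without blocking the audio thread
}

int64_t client_position() {
  static timing_sample last = {0, 0};
  if (timing.updated())     // only read when fresh data was published
    last = timing.read();
  return last.position_frames;
}

void stream_destroy() {
  timing.invalidate();      // the 0005 fix: reset state and indices so a
                            // recycled stream starts with updated() == false
}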
+--- + src/cubeb_aaudio.cpp | 2 ++ + src/cubeb_triple_buffer.h | 7 +++++++ + test/test_triple_buffer.cpp | 3 +++ + 3 files changed, 12 insertions(+) + +diff --git a/src/cubeb_aaudio.cpp b/src/cubeb_aaudio.cpp +index cfae2d6f..8b5eb231 100644 +--- a/src/cubeb_aaudio.cpp ++++ b/src/cubeb_aaudio.cpp +@@ -1049,6 +1049,8 @@ aaudio_stream_destroy_locked(cubeb_stream * stm, lock_guard<mutex> & lock) + stm->istream = nullptr; + } + ++ stm->timing_info.invalidate(); ++ + if (stm->resampler) { + cubeb_resampler_destroy(stm->resampler); + stm->resampler = nullptr; +diff --git a/src/cubeb_triple_buffer.h b/src/cubeb_triple_buffer.h +index a5a5978f..759b92e6 100644 +--- a/src/cubeb_triple_buffer.h ++++ b/src/cubeb_triple_buffer.h +@@ -42,6 +42,13 @@ template <typename T> class triple_buffer { + { + return (shared_state.load(std::memory_order_relaxed) & BACK_DIRTY_BIT) != 0; + } ++ // Reset state and indices to initial values. ++ void invalidate() ++ { ++ shared_state.store(0, std::memory_order_release); ++ input_idx = 1; ++ output_idx = 2; ++ } + + private: + // Publish a value to the consumer. Returns true if the data was overwritten +diff --git a/test/test_triple_buffer.cpp b/test/test_triple_buffer.cpp +index a6e0049b..d463c07e 100644 +--- a/test/test_triple_buffer.cpp ++++ b/test/test_triple_buffer.cpp +@@ -64,4 +64,7 @@ TEST(cubeb, triple_buffer) + } + + t.join(); ++ ++ buffer.invalidate(); ++ ASSERT_FALSE(buffer.updated()); + } diff --git a/media/libcubeb/moz.yaml b/media/libcubeb/moz.yaml index d79e64b5eb..3444bdb1d6 100644 --- a/media/libcubeb/moz.yaml +++ b/media/libcubeb/moz.yaml @@ -20,6 +20,8 @@ vendoring: - 0001-disable-aaudio-before-android-31.patch - 0002-disable-crash-reporter-death-test.patch - 0003-Only-build-duplex_collection_change_no_unregister-wh.patch + - 0004-audiounit-ios-compile-fixes.patch + - 0005-aaudio-timing-fix.patch skip-vendoring-steps: - update-moz-build exclude: diff --git a/media/libcubeb/src/cubeb_aaudio.cpp b/media/libcubeb/src/cubeb_aaudio.cpp index df19602cd6..c2441bbeef 100644 --- a/media/libcubeb/src/cubeb_aaudio.cpp +++ b/media/libcubeb/src/cubeb_aaudio.cpp @@ -1039,6 +1039,8 @@ aaudio_stream_destroy_locked(cubeb_stream * stm, lock_guard<mutex> & lock) stm->istream = nullptr; } + stm->timing_info.invalidate(); + if (stm->resampler) { cubeb_resampler_destroy(stm->resampler); stm->resampler = nullptr; diff --git a/media/libcubeb/src/cubeb_audiounit.cpp b/media/libcubeb/src/cubeb_audiounit.cpp index d823e80ff8..fb15790159 100644 --- a/media/libcubeb/src/cubeb_audiounit.cpp +++ b/media/libcubeb/src/cubeb_audiounit.cpp @@ -41,6 +41,15 @@ using namespace std; typedef UInt32 AudioFormatFlags; #endif +#if TARGET_OS_IPHONE +typedef UInt32 AudioDeviceID; +typedef UInt32 AudioObjectID; +const UInt32 kAudioObjectUnknown = 0; + +#define AudioGetCurrentHostTime mach_absolute_time + +#endif + #define AU_OUT_BUS 0 #define AU_IN_BUS 1 @@ -65,6 +74,7 @@ const char * PRIVATE_AGGREGATE_DEVICE_NAME = "CubebAggregateDevice"; LOG(msg, ##__VA_ARGS__); \ }) +#if !TARGET_OS_IPHONE /* Testing empirically, some headsets report a minimal latency that is very * low, but this does not work in practice. Lie and say the minimum is 256 * frames. 
*/ @@ -73,27 +83,28 @@ const uint32_t SAFE_MAX_LATENCY_FRAMES = 512; const AudioObjectPropertyAddress DEFAULT_INPUT_DEVICE_PROPERTY_ADDRESS = { kAudioHardwarePropertyDefaultInputDevice, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; const AudioObjectPropertyAddress DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS = { kAudioHardwarePropertyDefaultOutputDevice, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; const AudioObjectPropertyAddress DEVICE_IS_ALIVE_PROPERTY_ADDRESS = { kAudioDevicePropertyDeviceIsAlive, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; const AudioObjectPropertyAddress DEVICES_PROPERTY_ADDRESS = { kAudioHardwarePropertyDevices, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; const AudioObjectPropertyAddress INPUT_DATA_SOURCE_PROPERTY_ADDRESS = { kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeInput, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; const AudioObjectPropertyAddress OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS = { kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeOutput, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; +#endif typedef uint32_t device_flags_value; @@ -114,22 +125,22 @@ static void audiounit_close_stream(cubeb_stream * stm); static int audiounit_setup_stream(cubeb_stream * stm); +#if !TARGET_OS_IPHONE static vector<AudioObjectID> audiounit_get_devices_of_type(cubeb_device_type devtype); static UInt32 audiounit_get_device_presentation_latency(AudioObjectID devid, AudioObjectPropertyScope scope); - -#if !TARGET_OS_IPHONE static AudioObjectID audiounit_get_default_device_id(cubeb_device_type type); static int audiounit_uninstall_device_changed_callback(cubeb_stream * stm); static int audiounit_uninstall_system_changed_callback(cubeb_stream * stm); +#endif + static void audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags); -#endif extern cubeb_ops const audiounit_ops; @@ -144,9 +155,11 @@ struct cubeb { cubeb_device_collection_changed_callback output_collection_changed_callback = nullptr; void * output_collection_changed_user_ptr = nullptr; + #if !TARGET_OS_IPHONE // Store list of devices to detect changes vector<AudioObjectID> input_device_array; vector<AudioObjectID> output_device_array; + #endif // The queue should be released when it’s no longer needed. dispatch_queue_t serial_queue = dispatch_queue_create(DISPATCH_QUEUE_LABEL, DISPATCH_QUEUE_SERIAL); @@ -186,6 +199,7 @@ struct device_info { device_flags_value flags = DEV_UNKNOWN; }; +#if !TARGET_OS_IPHONE struct property_listener { AudioDeviceID device_id; const AudioObjectPropertyAddress * property_address; @@ -199,6 +213,7 @@ struct property_listener { { } }; +#endif struct cubeb_stream { explicit cubeb_stream(cubeb * context); @@ -257,22 +272,26 @@ struct cubeb_stream { /* This is true if a device change callback is currently running. 
*/ atomic<bool> switching_device{false}; atomic<bool> buffer_size_change_state{false}; + #if !TARGET_OS_IPHONE AudioDeviceID aggregate_device_id = kAudioObjectUnknown; // the aggregate device id AudioObjectID plugin_id = kAudioObjectUnknown; // used to create aggregate device + #endif /* Mixer interface */ unique_ptr<cubeb_mixer, decltype(&cubeb_mixer_destroy)> mixer; /* Buffer where remixing/resampling will occur when upmixing is required */ /* Only accessed from callback thread */ unique_ptr<uint8_t[]> temp_buffer; size_t temp_buffer_size = 0; // size in bytes. + #if !TARGET_OS_IPHONE /* Listeners indicating what system events are monitored. */ unique_ptr<property_listener> default_input_listener; unique_ptr<property_listener> default_output_listener; unique_ptr<property_listener> input_alive_listener; unique_ptr<property_listener> input_source_listener; unique_ptr<property_listener> output_source_listener; + #endif }; bool @@ -386,14 +405,6 @@ is_common_sample_rate(Float64 sample_rate) sample_rate == 88200 || sample_rate == 96000; } -#if TARGET_OS_IPHONE -typedef UInt32 AudioDeviceID; -typedef UInt32 AudioObjectID; - -#define AudioGetCurrentHostTime mach_absolute_time - -#endif - uint64_t ConvertHostTimeToNanos(uint64_t host_time) { @@ -761,13 +772,13 @@ audiounit_get_backend_id(cubeb * /* ctx */) return "audiounit"; } -#if !TARGET_OS_IPHONE static int audiounit_stream_get_volume(cubeb_stream * stm, float * volume); static int audiounit_stream_set_volume(cubeb_stream * stm, float volume); +#if !TARGET_OS_IPHONE static int audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side) { @@ -811,6 +822,7 @@ audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side) return CUBEB_OK; } +#endif static int audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags) @@ -822,10 +834,13 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags) audiounit_stream_stop_internal(stm); } - int r = audiounit_uninstall_device_changed_callback(stm); + int r; +#if !TARGET_OS_IPHONE + r = audiounit_uninstall_device_changed_callback(stm); if (r != CUBEB_OK) { LOG("(%p) Could not uninstall all device change listeners.", stm); } +#endif { auto_lock lock(stm->mutex); @@ -837,6 +852,7 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags) audiounit_close_stream(stm); + #if !TARGET_OS_IPHONE /* Reinit occurs in one of the following case: * - When the device is not alive any more * - When the default system device change. @@ -866,8 +882,11 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags) return CUBEB_ERROR; } + #endif + if (audiounit_setup_stream(stm) != CUBEB_OK) { LOG("(%p) Stream reinit failed.", stm); + #if !TARGET_OS_IPHONE if (flags & DEV_INPUT && input_device != kAudioObjectUnknown) { // Attempt to re-use the same device-id failed, so attempt again with // default input device. 
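The recurring shape of these hunks is worth spelling out: CoreAudio's HAL device APIs (AudioObjectID lookups, property listeners, aggregate devices) exist only on macOS, so each call site is fenced with the TARGET_OS_IPHONE macro from Apple's TargetConditionals.h, and the iOS side falls back to a stub or a fixed default. A minimal, self-contained illustration of that guard follows; the helper function is hypothetical, not cubeb code.

#include <TargetConditionals.h>

/* Hypothetical helper showing the compile-time guard used throughout this
 * patch. On iOS the audio session owns routing, so there is no HAL device
 * to pick; on macOS the AudioObject* property APIs are available. */
static int
backend_supports_device_selection(void)
{
#if TARGET_OS_IPHONE
  return 0; /* report "not supported", as the iOS stubs in this patch do */
#else
  return 1; /* enumerate devices via AudioObjectGetPropertyData() */
#endif
}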
@@ -879,6 +898,7 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags) return CUBEB_ERROR; } } + #endif } if (vol_rv == CUBEB_OK) { @@ -914,9 +934,11 @@ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags) } if (audiounit_reinit_stream(stm, flags) != CUBEB_OK) { + #if !TARGET_OS_IPHONE if (audiounit_uninstall_system_changed_callback(stm) != CUBEB_OK) { LOG("(%p) Could not uninstall system changed callback", stm); } + #endif stm->state_callback(stm, stm->user_ptr, CUBEB_STATE_ERROR); LOG("(%p) Could not reopen the stream after switching.", stm); } @@ -925,6 +947,7 @@ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags) }); } +#if !TARGET_OS_IPHONE static char const * event_addr_to_string(AudioObjectPropertySelector selector) { @@ -1096,6 +1119,7 @@ audiounit_install_device_changed_callback(cubeb_stream * stm) return r; } +#if !TARGET_OS_IPHONE static int audiounit_install_system_changed_callback(cubeb_stream * stm) { @@ -1136,6 +1160,7 @@ audiounit_install_system_changed_callback(cubeb_stream * stm) return CUBEB_OK; } +#endif static int audiounit_uninstall_device_changed_callback(cubeb_stream * stm) @@ -1212,7 +1237,7 @@ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range) AudioDeviceID output_device_id; AudioObjectPropertyAddress output_device_buffer_size_range = { kAudioDevicePropertyBufferFrameSizeRange, kAudioDevicePropertyScopeOutput, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT); if (output_device_id == kAudioObjectUnknown) { @@ -1233,7 +1258,6 @@ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range) return CUBEB_OK; } -#endif /* !TARGET_OS_IPHONE */ static AudioObjectID audiounit_get_default_device_id(cubeb_device_type type) @@ -1256,6 +1280,7 @@ audiounit_get_default_device_id(cubeb_device_type type) return devid; } +#endif /* !TARGET_OS_IPHONE */ int audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels) @@ -1270,7 +1295,7 @@ audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels) AudioStreamBasicDescription stream_format; AudioObjectPropertyAddress stream_format_address = { kAudioDevicePropertyStreamFormat, kAudioDevicePropertyScopeOutput, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; assert(ctx && max_channels); @@ -1309,17 +1334,16 @@ audiounit_get_min_latency(cubeb * /* ctx */, cubeb_stream_params /* params */, *latency_frames = max<uint32_t>(latency_range.mMinimum, SAFE_MIN_LATENCY_FRAMES); -#endif - return CUBEB_OK; +#endif } static int audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate) { #if TARGET_OS_IPHONE - // TODO - return CUBEB_ERROR_NOT_SUPPORTED; + *rate = 44100; + return CUBEB_OK; #else UInt32 size; OSStatus r; @@ -1327,7 +1351,7 @@ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate) AudioDeviceID output_device_id; AudioObjectPropertyAddress samplerate_address = { kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT); if (output_device_id == kAudioObjectUnknown) { @@ -1343,8 +1367,9 @@ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate) } *rate = static_cast<uint32_t>(fsamplerate); -#endif + return CUBEB_OK; +#endif } static cubeb_channel_layout @@ -1385,6 +1410,9 @@ 
audiounit_convert_channel_layout(AudioChannelLayout * layout) static cubeb_channel_layout audiounit_get_preferred_channel_layout(AudioUnit output_unit) { + #if TARGET_OS_IPHONE + return CUBEB_LAYOUT_STEREO; + #else OSStatus rv = noErr; UInt32 size = 0; rv = AudioUnitGetPropertyInfo( @@ -1409,6 +1437,7 @@ audiounit_get_preferred_channel_layout(AudioUnit output_unit) } return audiounit_convert_channel_layout(layout.get()); + #endif } static cubeb_channel_layout @@ -1442,8 +1471,10 @@ audiounit_get_current_channel_layout(AudioUnit output_unit) static int audiounit_create_unit(AudioUnit * unit, device_info * device); +#if !TARGET_OS_IPHONE static OSStatus audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype); +#endif static void audiounit_destroy(cubeb * ctx) @@ -1465,6 +1496,7 @@ audiounit_destroy(cubeb * ctx) !ctx->output_collection_changed_callback && !ctx->output_collection_changed_user_ptr); + #if !TARGET_OS_IPHONE /* Unregister the callback if necessary. */ if (ctx->input_collection_changed_callback) { audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_INPUT); @@ -1472,6 +1504,7 @@ audiounit_destroy(cubeb * ctx) if (ctx->output_collection_changed_callback) { audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_OUTPUT); } + #endif } dispatch_release(ctx->serial_queue); @@ -1599,13 +1632,14 @@ audiounit_layout_init(cubeb_stream * stm, io_side side) stm->context->layout); } +#if !TARGET_OS_IPHONE static vector<AudioObjectID> audiounit_get_sub_devices(AudioDeviceID device_id) { vector<AudioDeviceID> sub_devices; AudioObjectPropertyAddress property_address = { kAudioAggregateDevicePropertyActiveSubDeviceList, - kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain}; UInt32 size = 0; OSStatus rv = AudioObjectGetPropertyDataSize(device_id, &property_address, 0, nullptr, &size); @@ -1634,7 +1668,7 @@ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id, { AudioObjectPropertyAddress address_plugin_bundle_id = { kAudioHardwarePropertyPlugInForBundleID, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; UInt32 size = 0; OSStatus r = AudioObjectGetPropertyDataSize( kAudioObjectSystemObject, &address_plugin_bundle_id, 0, NULL, &size); @@ -1664,7 +1698,7 @@ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id, AudioObjectPropertyAddress create_aggregate_device_address = { kAudioPlugInCreateAggregateDevice, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; r = AudioObjectGetPropertyDataSize( *plugin_id, &create_aggregate_device_address, 0, nullptr, &size); if (r != noErr) { @@ -1736,7 +1770,7 @@ get_device_name(AudioDeviceID id) CFStringRef UIname = nullptr; AudioObjectPropertyAddress address_uuid = {kAudioDevicePropertyDeviceUID, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; OSStatus err = AudioObjectGetPropertyData(id, &address_uuid, 0, nullptr, &size, &UIname); return (err == noErr) ? 
UIname : NULL; @@ -1779,7 +1813,7 @@ audiounit_set_aggregate_sub_device_list(AudioDeviceID aggregate_device_id, AudioObjectPropertyAddress aggregate_sub_device_list = { kAudioAggregateDevicePropertyFullSubDeviceList, - kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain}; UInt32 size = sizeof(CFMutableArrayRef); OSStatus rv = AudioObjectSetPropertyData( aggregate_device_id, &aggregate_sub_device_list, 0, nullptr, size, @@ -1801,7 +1835,7 @@ audiounit_set_master_aggregate_device(const AudioDeviceID aggregate_device_id) assert(aggregate_device_id != kAudioObjectUnknown); AudioObjectPropertyAddress master_aggregate_sub_device = { kAudioAggregateDevicePropertyMasterSubDevice, - kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain}; // Master become the 1st output sub device AudioDeviceID output_device_id = @@ -1834,7 +1868,7 @@ audiounit_activate_clock_drift_compensation( assert(aggregate_device_id != kAudioObjectUnknown); AudioObjectPropertyAddress address_owned = { kAudioObjectPropertyOwnedObjects, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; UInt32 qualifier_data_size = sizeof(AudioObjectID); AudioClassID class_id = kAudioSubDeviceClassID; @@ -1866,7 +1900,7 @@ audiounit_activate_clock_drift_compensation( AudioObjectPropertyAddress address_drift = { kAudioSubDevicePropertyDriftCompensation, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; // Start from the second device since the first is the master clock for (UInt32 i = 1; i < subdevices_num; ++i) { @@ -1935,7 +1969,7 @@ audiounit_workaround_for_airpod(cubeb_stream * stm) Float64 rate = input_nominal_rate; AudioObjectPropertyAddress addr = {kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; OSStatus rv = AudioObjectSetPropertyData(stm->aggregate_device_id, &addr, 0, nullptr, sizeof(Float64), &rate); @@ -2019,7 +2053,7 @@ audiounit_destroy_aggregate_device(AudioObjectID plugin_id, plugin_id != kAudioObjectUnknown); AudioObjectPropertyAddress destroy_aggregate_device_addr = { kAudioPlugInDestroyAggregateDevice, kAudioObjectPropertyScopeGlobal, - kAudioObjectPropertyElementMaster}; + kAudioObjectPropertyElementMain}; UInt32 size; OSStatus rv = AudioObjectGetPropertyDataSize( plugin_id, &destroy_aggregate_device_addr, 0, NULL, &size); @@ -2042,6 +2076,7 @@ audiounit_destroy_aggregate_device(AudioObjectID plugin_id, *aggregate_device_id = kAudioObjectUnknown; return CUBEB_OK; } +#endif static int audiounit_new_unit_instance(AudioUnit * unit, device_info * device) @@ -2178,6 +2213,9 @@ audiounit_init_input_linear_buffer(cubeb_stream * stream, uint32_t capacity) static uint32_t audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames) { + #if TARGET_OS_IPHONE + return latency_frames; + #else // For the 1st stream set anything within safe min-max assert(audiounit_active_streams(stm->context) > 0); if (audiounit_active_streams(stm->context) == 1) { @@ -2238,8 +2276,10 @@ audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames) return max(min<uint32_t>(latency_frames, upper_latency_limit), SAFE_MIN_LATENCY_FRAMES); + #endif } +#if !TARGET_OS_IPHONE /* * Change buffer size is prone to deadlock thus we change it * following the steps: @@ -2290,11 
+2330,15 @@ buffer_size_changed_callback(void * inClientData, AudioUnit inUnit, } } } +#endif static int audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames, io_side side) { + #if TARGET_OS_IPHONE + return CUBEB_OK; + #else AudioUnit au = stm->output_unit; AudioUnitScope au_scope = kAudioUnitScope_Input; AudioUnitElement au_element = AU_OUT_BUS; @@ -2382,6 +2426,7 @@ audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames, LOG("(%p) %s buffer size changed to %u frames.", stm, to_string(side), new_size_frames); return CUBEB_OK; + #endif } static int @@ -2598,6 +2643,7 @@ audiounit_setup_stream(cubeb_stream * stm) device_info in_dev_info = stm->input_device; device_info out_dev_info = stm->output_device; + #if !TARGET_OS_IPHONE if (has_input(stm) && has_output(stm) && stm->input_device.id != stm->output_device.id) { r = audiounit_create_aggregate_device(stm); @@ -2615,6 +2661,10 @@ audiounit_setup_stream(cubeb_stream * stm) out_dev_info.flags = DEV_OUTPUT; } } + #else + in_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_INPUT; + out_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_OUTPUT; + #endif if (has_input(stm)) { r = audiounit_create_unit(&stm->input_unit, &in_dev_info); @@ -2756,8 +2806,10 @@ audiounit_setup_stream(cubeb_stream * stm) return CUBEB_ERROR; } + #if !TARGET_OS_IPHONE stm->current_latency_frames = audiounit_get_device_presentation_latency( stm->output_device.id, kAudioDevicePropertyScopeOutput); + #endif Float64 unit_s; UInt32 size = sizeof(unit_s); @@ -2777,10 +2829,12 @@ audiounit_setup_stream(cubeb_stream * stm) ceilf(stm->output_hw_rate / stm->input_hw_rate); } + #if !TARGET_OS_IPHONE r = audiounit_install_device_changed_callback(stm); if (r != CUBEB_OK) { LOG("(%p) Could not install all device change callback.", stm); } + #endif return CUBEB_OK; } @@ -2828,21 +2882,25 @@ audiounit_stream_init(cubeb * context, cubeb_stream ** stream, } if (input_stream_params) { stm->input_stream_params = *input_stream_params; + #if !TARGET_OS_IPHONE r = audiounit_set_device_info( stm.get(), reinterpret_cast<uintptr_t>(input_device), io_side::INPUT); if (r != CUBEB_OK) { LOG("(%p) Fail to set device info for input.", stm.get()); return r; } + #endif } if (output_stream_params) { stm->output_stream_params = *output_stream_params; + #if !TARGET_OS_IPHONE r = audiounit_set_device_info( stm.get(), reinterpret_cast<uintptr_t>(output_device), io_side::OUTPUT); if (r != CUBEB_OK) { LOG("(%p) Fail to set device info for output.", stm.get()); return r; } + #endif } { @@ -2858,11 +2916,13 @@ audiounit_stream_init(cubeb * context, cubeb_stream ** stream, return r; } + #if !TARGET_OS_IPHONE r = audiounit_install_system_changed_callback(stm.get()); if (r != CUBEB_OK) { LOG("(%p) Could not install the device change callback.", stm.get()); return r; } + #endif *stream = stm.release(); LOG("(%p) Cubeb stream init successful.", *stream); @@ -2891,11 +2951,13 @@ audiounit_close_stream(cubeb_stream * stm) stm->resampler.reset(); stm->mixer.reset(); + #if !TARGET_OS_IPHONE if (stm->aggregate_device_id != kAudioObjectUnknown) { audiounit_destroy_aggregate_device(stm->plugin_id, &stm->aggregate_device_id); stm->aggregate_device_id = kAudioObjectUnknown; } + #endif } static void @@ -2903,6 +2965,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm) { stm->context->mutex.assert_current_thread_owns(); +#if !TARGET_OS_IPHONE int r = audiounit_uninstall_system_changed_callback(stm); if (r != CUBEB_OK) { LOG("(%p) Could not uninstall the device changed callback", stm); @@ -2911,6 +2974,7 
@@ audiounit_stream_destroy_internal(cubeb_stream * stm)
 if (r != CUBEB_OK) {
 LOG("(%p) Could not uninstall all device change listeners", stm);
 }
+#endif

 auto_lock lock(stm->mutex);
 audiounit_close_stream(stm);
@@ -2921,6 +2985,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)

 static void
 audiounit_stream_destroy(cubeb_stream * stm)
 {
+ #if !TARGET_OS_IPHONE
 int r = audiounit_uninstall_system_changed_callback(stm);
 if (r != CUBEB_OK) {
 LOG("(%p) Could not uninstall the device changed callback", stm);
@@ -2929,6 +2994,7 @@ audiounit_stream_destroy(cubeb_stream * stm)
 if (r != CUBEB_OK) {
 LOG("(%p) Could not uninstall all device change listeners", stm);
 }
+ #endif

 if (!stm->shutdown.load()) {
 auto_lock context_lock(stm->context->mutex);
@@ -3086,6 +3152,7 @@ convert_uint32_into_string(UInt32 data)
 return str;
 }

+#if !TARGET_OS_IPHONE
 int
 audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
 {
@@ -3107,12 +3174,16 @@ audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
 return CUBEB_OK;
 }
+#endif

 int
 audiounit_get_default_device_name(cubeb_stream * stm,
 cubeb_device * const device,
 cubeb_device_type type)
 {
+#if TARGET_OS_IPHONE
+ return CUBEB_ERROR_NOT_SUPPORTED;
+#else
 assert(stm);
 assert(device);
@@ -3129,6 +3200,7 @@ audiounit_get_default_device_name(cubeb_stream * stm,
 type == CUBEB_DEVICE_TYPE_INPUT ? "input" : "output");
 }
 return CUBEB_OK;
+ #endif
 }

 int
@@ -3183,6 +3255,7 @@ audiounit_stream_register_device_changed_callback(
 return CUBEB_OK;
 }

+#if !TARGET_OS_IPHONE
 static char *
 audiounit_strref_to_cstr_utf8(CFStringRef strref)
 {
@@ -3204,12 +3277,14 @@ audiounit_strref_to_cstr_utf8(CFStringRef strref)
 return ret;
 }
+#endif

+#if !TARGET_OS_IPHONE
 static uint32_t
 audiounit_get_channel_count(AudioObjectID devid, AudioObjectPropertyScope scope)
 {
 AudioObjectPropertyAddress adr = {0, scope,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
 UInt32 size = 0;
 uint32_t i, ret = 0;
@@ -3235,7 +3310,7 @@ audiounit_get_available_samplerate(AudioObjectID devid,
 uint32_t * def)
 {
 AudioObjectPropertyAddress adr = {0, scope,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
 adr.mSelector = kAudioDevicePropertyNominalSampleRate;
 if (AudioObjectHasProperty(devid, &adr)) {
@@ -3277,7 +3352,7 @@ audiounit_get_device_presentation_latency(AudioObjectID devid,
 AudioObjectPropertyScope scope)
 {
 AudioObjectPropertyAddress adr = {0, scope,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
 UInt32 size, dev, stream = 0;
 AudioStreamID sid[1];
@@ -3302,7 +3377,7 @@ static int
 audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
 AudioObjectID devid, cubeb_device_type type)
 {
- AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMaster};
+ AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMain};
 UInt32 size;
 if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
@@ -3313,7 +3388,11 @@ audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
 return CUBEB_ERROR;
 }
+ #if TARGET_OS_IPHONE
+ UInt32 ch = 2;
+ #else
 UInt32 ch = audiounit_get_channel_count(devid, adr.mScope);
+ #endif
 if (ch == 0) {
 return CUBEB_ERROR;
 }
@@ -3417,7 +3496,16 @@ is_aggregate_device(cubeb_device_info * device_info)
 return !strncmp(device_info->friendly_name, PRIVATE_AGGREGATE_DEVICE_NAME,
 strlen(PRIVATE_AGGREGATE_DEVICE_NAME));
 }
+#endif

+#if TARGET_OS_IPHONE
+static int
+audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
+ cubeb_device_collection *
collection) +{ + return CUBEB_ERROR_NOT_SUPPORTED; +} +#else static int audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type, cubeb_device_collection * collection) @@ -3483,19 +3571,25 @@ audiounit_device_destroy(cubeb_device_info * device) delete[] device->friendly_name; delete[] device->vendor_name; } +#endif static int audiounit_device_collection_destroy(cubeb * /* context */, cubeb_device_collection * collection) { + #if TARGET_OS_IPHONE + return CUBEB_ERROR_NOT_SUPPORTED; + #else for (size_t i = 0; i < collection->count; i++) { audiounit_device_destroy(&collection->device[i]); } delete[] collection->device; return CUBEB_OK; + #endif } +#if !TARGET_OS_IPHONE static vector<AudioObjectID> audiounit_get_devices_of_type(cubeb_device_type devtype) { @@ -3658,7 +3752,18 @@ audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype) kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS, audiounit_collection_changed_callback, context); } +#endif +#if TARGET_OS_IPHONE +int +audiounit_register_device_collection_changed( + cubeb * context, cubeb_device_type devtype, + cubeb_device_collection_changed_callback collection_changed_callback, + void * user_ptr) +{ + return CUBEB_ERROR_NOT_SUPPORTED; +} +#else int audiounit_register_device_collection_changed( cubeb * context, cubeb_device_type devtype, @@ -3678,6 +3783,7 @@ audiounit_register_device_collection_changed( } return (ret == noErr) ? CUBEB_OK : CUBEB_ERROR; } +#endif cubeb_ops const audiounit_ops = { /*.init =*/audiounit_init, diff --git a/media/libcubeb/src/cubeb_triple_buffer.h b/media/libcubeb/src/cubeb_triple_buffer.h index a5a5978fb4..759b92e62b 100644 --- a/media/libcubeb/src/cubeb_triple_buffer.h +++ b/media/libcubeb/src/cubeb_triple_buffer.h @@ -42,6 +42,13 @@ public: { return (shared_state.load(std::memory_order_relaxed) & BACK_DIRTY_BIT) != 0; } + // Reset state and indices to initial values. + void invalidate() + { + shared_state.store(0, std::memory_order_release); + input_idx = 1; + output_idx = 2; + } private: // Publish a value to the consumer. Returns true if the data was overwritten diff --git a/media/libcubeb/src/moz.build b/media/libcubeb/src/moz.build index d7d05b5867..46a89c4063 100644 --- a/media/libcubeb/src/moz.build +++ b/media/libcubeb/src/moz.build @@ -74,8 +74,8 @@ if CONFIG['MOZ_AUDIOUNIT_RUST']: SOURCES += [ 'cubeb_osx_run_loop.c', ] + DEFINES['USE_AUDIOUNIT_RUST'] = True DEFINES['USE_AUDIOUNIT'] = True - DEFINES['USE_AUDIOUNIT_RUST'] = True if CONFIG['MOZ_WASAPI']: SOURCES += [ diff --git a/media/libcubeb/test/test_triple_buffer.cpp b/media/libcubeb/test/test_triple_buffer.cpp index a6e0049b79..d463c07e03 100644 --- a/media/libcubeb/test/test_triple_buffer.cpp +++ b/media/libcubeb/test/test_triple_buffer.cpp @@ -64,4 +64,7 @@ TEST(cubeb, triple_buffer) } t.join(); + + buffer.invalidate(); + ASSERT_FALSE(buffer.updated()); } diff --git a/media/libdav1d/config.h b/media/libdav1d/config.h index 218c8ae7f4..c7bdc7defc 100644 --- a/media/libdav1d/config.h +++ b/media/libdav1d/config.h @@ -46,7 +46,10 @@ // Those values are copied from the auto generated // config file produced by stand alone dav1d build. # define HAVE_AS_FUNC 0 +// Build with <sys/auxv.h> header only on Linux-specific systems. 
+#if defined(__linux__) # define HAVE_GETAUXVAL 1 +#endif # define PIC 3 #endif diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml index 22994fc7bf..ca526ea688 100644 --- a/media/libdav1d/moz.yaml +++ b/media/libdav1d/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: 4796b59fc0a459588183dc2ea199ba1074befc67 (2024-02-18T15:37:04.000+01:00). + release: 8e08426468a76d8a667e8a79d92bafd85d7411ac (2024-03-18T20:50:37.000+00:00). # Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 4796b59fc0a459588183dc2ea199ba1074befc67 + revision: 8e08426468a76d8a667e8a79d92bafd85d7411ac # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h index 1ac3f3ded3..af1770d5bd 100644 --- a/media/libdav1d/vcs_version.h +++ b/media/libdav1d/vcs_version.h @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION "4796b59fc0a459588183dc2ea199ba1074befc67" +#define DAV1D_VERSION "8e08426468a76d8a667e8a79d92bafd85d7411ac" diff --git a/media/libjxl/moz.yaml b/media/libjxl/moz.yaml index 7b8d187ff4..ddf34a3dc9 100644 --- a/media/libjxl/moz.yaml +++ b/media/libjxl/moz.yaml @@ -10,9 +10,9 @@ origin: url: https://github.com/libjxl/libjxl - release: f06a34c77b1bd11bafbe82989241e68c756ccca2 (2024-03-11T15:14:53Z). + release: a5e4aa1fc1fe5bee252225a2616dccde7fd35da0 (2024-04-01T20:09:39Z). - revision: f06a34c77b1bd11bafbe82989241e68c756ccca2 + revision: a5e4aa1fc1fe5bee252225a2616dccde7fd35da0 license: Apache-2.0 diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c index 06a53435b8..6785121ac9 100644 --- a/media/libopus/celt/arm/armcpu.c +++ b/media/libopus/celt/arm/armcpu.c @@ -96,7 +96,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ /* Linux based */ #include <stdio.h> -opus_uint32 opus_cpu_capabilities(void) +static opus_uint32 opus_cpu_capabilities(void) { opus_uint32 flags = 0; FILE *cpuinfo; @@ -169,7 +169,7 @@ opus_uint32 opus_cpu_capabilities(void) #include <sys/types.h> #include <sys/sysctl.h> -opus_uint32 opus_cpu_capabilities(void) +static opus_uint32 opus_cpu_capabilities(void) { opus_uint32 flags = 0; @@ -191,6 +191,54 @@ opus_uint32 opus_cpu_capabilities(void) return flags; } +#elif defined(__FreeBSD__) +#include <sys/auxv.h> + +static opus_uint32 opus_cpu_capabilities(void) +{ + long hwcap = 0; + opus_uint32 flags = 0; + +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \ + || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) + /* FreeBSD requires armv6+, which always supports media instructions */ + flags |= OPUS_CPU_ARM_MEDIA_FLAG; +# endif + + elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); + +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \ + || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# ifdef HWCAP_EDSP + if (hwcap & HWCAP_EDSP) + flags |= OPUS_CPU_ARM_EDSP_FLAG; +# endif + +# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# ifdef HWCAP_NEON + if (hwcap & HWCAP_NEON) + flags |= OPUS_CPU_ARM_NEON_FLAG; +# elif defined(HWCAP_ASIMD) + if (hwcap & HWCAP_ASIMD) + flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG; +# endif +# endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) && defined(HWCAP_ASIMDDP) + if (hwcap & HWCAP_ASIMDDP) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +# 
endif + +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + + return (flags); +} + #else /* The feature registers which can tell us what the processor supports are * accessible in priveleged modes only, so we can't have a general user-space diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h index 8ae9be8d8f..1e5b6a4cb3 100644 --- a/media/libopus/celt/x86/x86cpu.h +++ b/media/libopus/celt/x86/x86cpu.h @@ -68,8 +68,22 @@ int opus_select_arch(void); Use this to work around those restrictions (which should hopefully all get optimized to a single MOVD instruction). GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug! - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */ -# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8)) + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 + LLVM implemented _mm_loadu_si32() since Clang 8.0, however the + __clang_major__ version number macro is unreliable, as vendors + (specifically, Apple) will use different numbering schemes than upstream. + Clang's advice is "use feature detection", but they do not provide feature + detection support for specific SIMD functions. + We follow the approach from the SIMDe project and instead detect unrelated + features that should be available in the version we want (see + <https://github.com/simd-everywhere/simde/blob/master/simde/simde-detect-clang.h>).*/ +# if defined(__clang__) +# if __has_warning("-Wextra-semi-stmt") || \ + __has_builtin(__builtin_rotateleft32) +# define OPUS_CLANG_8 (1) +# endif +# endif +# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !defined(OPUS_CLANG_8) # include <string.h> # include <emmintrin.h> diff --git a/media/libopus/moz.build b/media/libopus/moz.build index 44c0ab7c90..c5b2021ba7 100644 --- a/media/libopus/moz.build +++ b/media/libopus/moz.build @@ -21,7 +21,7 @@ FINAL_LIBRARY = "gkcodecs" NoVisibilityFlags() DEFINES["OPUS_BUILD"] = True -DEFINES["OPUS_VERSION"] = "ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa" +DEFINES["OPUS_VERSION"] = "fdb198e88660721e289df94c29e91f70caff787e" DEFINES["USE_ALLOCA"] = True DEFINES["ENABLE_HARDENING"] = True diff --git a/media/libopus/moz.yaml b/media/libopus/moz.yaml index ed76d36d1f..7728a66c41 100644 --- a/media/libopus/moz.yaml +++ b/media/libopus/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa (2024-03-04T11:53:07.000-05:00). + release: fdb198e88660721e289df94c29e91f70caff787e (2024-04-09T14:29:12.000-04:00). 
# Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa + revision: fdb198e88660721e289df94c29e91f70caff787e # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/media/libopus/silk/x86/NSQ_del_dec_avx2.c b/media/libopus/silk/x86/NSQ_del_dec_avx2.c index 43485871a4..21f00c2dad 100644 --- a/media/libopus/silk/x86/NSQ_del_dec_avx2.c +++ b/media/libopus/silk/x86/NSQ_del_dec_avx2.c @@ -73,7 +73,6 @@ static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC) /* Intrinsics not defined on MSVC */ #ifdef _MSC_VER #include <Intsafe.h> -#define __m128i_u __m128i static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) { *res = a+b; @@ -959,7 +958,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( { __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i])); x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16); - _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); + _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); } /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ @@ -985,8 +984,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( /* Scale long-term shaping state */ for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4) { - __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i]; - *p = silk_mm_smulww_epi32(*p, gain_adj_Q16); + opus_int32 *p = &NSQ->sLTP_shp_Q14[i]; + _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16)); } /* Scale long-term prediction state */ @@ -1041,13 +1040,13 @@ static OPUS_INLINE void silk_LPC_analysis_filter_avx2( /* Allowing wrap around so that two wraps can cancel each other. 
The rare cases where the result wraps around can only be triggered by invalid streams*/ - __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8])); - __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)& B[0])); + __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-8])); + __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)& B[0])); __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v)); if (order > 10) { - in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16])); - B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B [8])); + in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-16])); + B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&B [8])); B_v = silk_mm256_reverse_epi32(B_v); } else diff --git a/media/libopus/src/opus_private.h b/media/libopus/src/opus_private.h index 364c21cebc..279f5f95f6 100644 --- a/media/libopus/src/opus_private.h +++ b/media/libopus/src/opus_private.h @@ -214,7 +214,7 @@ int opus_multistream_decode_native( opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions); -opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, int nb_extensions, int pad); +opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, opus_int32 nb_extensions, int pad); opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len); diff --git a/media/libopus/src/repacketizer.c b/media/libopus/src/repacketizer.c index 6a7a8b3d8e..79798b0217 100644 --- a/media/libopus/src/repacketizer.c +++ b/media/libopus/src/repacketizer.c @@ -155,7 +155,8 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int /* incorporate any extensions from the repacketizer padding */ for (i=begin;i<end;i++) { - int frame_ext_count, j; + int j; + opus_int32 frame_ext_count; frame_ext_count = total_ext_count - ext_count; int ret = opus_packet_extensions_parse(rp->paddings[i], rp->padding_len[i], &all_extensions[ext_count], &frame_ext_count); diff --git a/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch new file mode 100644 index 0000000000..4788b3996a --- /dev/null +++ b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch @@ -0,0 +1,41 @@ +# HG changeset patch +# User Chun-Min Chang <chun.m.chang@gmail.com> + +Bug 1888772 - Allow ARM CPU runtime detection code to build on OpenBSD + +diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c +--- a/vpx_ports/aarch64_cpudetect.c ++++ b/vpx_ports/aarch64_cpudetect.c +@@ -10,30 +10,30 @@ + + #include "./vpx_config.h" + #include "arm_cpudetect.h" + + #if defined(__APPLE__) + #include <sys/sysctl.h> + #endif + +-#if !CONFIG_RUNTIME_CPU_DETECT ++#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) + + static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. 
+ int flags = 0; + #if HAVE_NEON + flags |= HAS_NEON; + #endif // HAVE_NEON + return flags; + } + +-#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT ++#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) + + // sysctlbyname() parameter documentation for instruction set characteristics: + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics + static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm index 47243ad198..7a1aaf999a 100644 --- a/media/libvpx/config/generic/vpx_config.asm +++ b/media/libvpx/config/generic/vpx_config.asm @@ -13,6 +13,7 @@ .equ HAVE_NEON_DOTPROD , 0 .equ HAVE_NEON_I8MM , 0 .equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/generic/vpx_config.c b/media/libvpx/config/generic/vpx_config.c index d1c3d1acd7..922edd1ea2 100644 --- a/media/libvpx/config/generic/vpx_config.c +++ b/media/libvpx/config/generic/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512"; +static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --log=/home/cm/Work/gecko-dev/media/libvpx/config/generic/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h index 774a531ed9..c885bb399a 100644 --- a/media/libvpx/config/generic/vpx_config.h +++ b/media/libvpx/config/generic/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm index ee43d0f922..6be2a7f7a2 100644 --- a/media/libvpx/config/linux/arm/vpx_config.asm +++ b/media/libvpx/config/linux/arm/vpx_config.asm @@ -13,6 +13,7 @@ .equ HAVE_NEON_DOTPROD , 0 .equ HAVE_NEON_I8MM , 0 .equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/linux/arm/vpx_config.c b/media/libvpx/config/linux/arm/vpx_config.c index c885d910c0..c634e2af66 100644 --- a/media/libvpx/config/linux/arm/vpx_config.c +++ b/media/libvpx/config/linux/arm/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only"; +static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h index bfd2c04e07..99a55f0ea9 100644 --- a/media/libvpx/config/linux/arm/vpx_config.h +++ b/media/libvpx/config/linux/arm/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h index 738de4f9f4..b7d828d446 100644 --- a/media/libvpx/config/linux/arm64/vp9_rtcd.h +++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h @@ -35,11 +35,13 @@ extern "C" { int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_neon +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_neon +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); @@ -96,6 +98,10 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_block_error = vp9_block_error_neon; + if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve; + vp9_block_error_fp = vp9_block_error_fp_neon; + if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve; } #endif diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm index 499c16202c..c51a76b3f6 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.asm +++ b/media/libvpx/config/linux/arm64/vpx_config.asm @@ -13,6 +13,7 @@ .equ HAVE_NEON_DOTPROD , 1 .equ HAVE_NEON_I8MM , 1 .equ 
HAVE_SVE , 1 +.equ HAVE_SVE2 , 1 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c index 74baa0689c..c0d714503f 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.c +++ b/media/libvpx/config/linux/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only"; +static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h index 3c5f2e33ca..12251ee0c1 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.h +++ b/media/libvpx/config/linux/arm64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 1 #define HAVE_NEON_I8MM 1 #define HAVE_SVE 1 +#define HAVE_SVE2 1 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h index 5a9b05ca14..2c31ee4ef9 100644 --- a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h +++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h @@ -916,7 +916,8 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t di uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); -#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_neon +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -1148,6 +1149,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod; vpx_sse = vpx_sse_neon; if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon; + if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve; vpx_variance16x16 = vpx_variance16x16_neon; if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod; vpx_variance16x32 = vpx_variance16x32_neon; diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm index eaa3950d37..5a92abf939 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.asm +++ b/media/libvpx/config/linux/ia32/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/ia32/vpx_config.c 
b/media/libvpx/config/linux/ia32/vpx_config.c index 6805ab62a8..7024ca989f 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.c +++ b/media/libvpx/config/linux/ia32/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/ia32/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h index 69fd63bf02..b4cc10a906 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.h +++ b/media/libvpx/config/linux/ia32/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm index 8715768a2e..148a894979 100644 --- a/media/libvpx/config/linux/x64/vpx_config.asm +++ b/media/libvpx/config/linux/x64/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.c b/media/libvpx/config/linux/x64/vpx_config.c index e4dcb394c3..f38bd16290 100644 --- a/media/libvpx/config/linux/x64/vpx_config.c +++ b/media/libvpx/config/linux/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/x64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h index ab4439aaf4..d91509ad10 100644 --- a/media/libvpx/config/linux/x64/vpx_config.h +++ b/media/libvpx/config/linux/x64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm index eaa3950d37..5a92abf939 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.asm +++ b/media/libvpx/config/mac/ia32/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.c b/media/libvpx/config/mac/ia32/vpx_config.c index 3e5d3ec0f3..2ee9d0ebb0 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.c +++ b/media/libvpx/config/mac/ia32/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/ia32/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h index 69fd63bf02..b4cc10a906 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.h +++ b/media/libvpx/config/mac/ia32/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm index 8715768a2e..148a894979 100644 --- a/media/libvpx/config/mac/x64/vpx_config.asm +++ b/media/libvpx/config/mac/x64/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.c b/media/libvpx/config/mac/x64/vpx_config.c index 9a06646fdc..51fceeb6e3 100644 --- a/media/libvpx/config/mac/x64/vpx_config.c +++ b/media/libvpx/config/mac/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/x64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h index ab4439aaf4..d91509ad10 100644 --- a/media/libvpx/config/mac/x64/vpx_config.h +++ b/media/libvpx/config/mac/x64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm index 24eb1a8cba..32d700f1bb 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.asm +++ b/media/libvpx/config/win/aarch64/vpx_config.asm @@ -12,7 +12,8 @@ .equ HAVE_NEON , 1 .equ HAVE_NEON_DOTPROD , 1 .equ HAVE_NEON_I8MM , 1 -.equ HAVE_SVE , 1 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/win/aarch64/vpx_config.c b/media/libvpx/config/win/aarch64/vpx_config.c index 13cc13a95d..b8f4ec8754 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.c +++ b/media/libvpx/config/win/aarch64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only"; +static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --disable-sve --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/aarch64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h index c3cc860f18..a81f868053 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.h +++ b/media/libvpx/config/win/aarch64/vpx_config.h @@ -21,7 +21,8 @@ #define HAVE_NEON 1 #define HAVE_NEON_DOTPROD 1 #define HAVE_NEON_I8MM 1 -#define HAVE_SVE 1 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm index cb1aa7ce6a..9c7e3ce2c2 100755 --- a/media/libvpx/config/win/ia32/vpx_config.asm +++ b/media/libvpx/config/win/ia32/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.c b/media/libvpx/config/win/ia32/vpx_config.c index 33c836213b..8cdd6c30b2 100644 --- a/media/libvpx/config/win/ia32/vpx_config.c +++ b/media/libvpx/config/win/ia32/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/ia32/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h index 9fe256f4ad..b62188c71c 100644 --- a/media/libvpx/config/win/ia32/vpx_config.h +++ b/media/libvpx/config/win/ia32/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm index a1d34d6d37..d5f5f3968e 100644 --- a/media/libvpx/config/win/x64/vpx_config.asm +++ b/media/libvpx/config/win/x64/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c index 8c04c1a3cf..57904c7dc6 100644 --- a/media/libvpx/config/win/x64/vpx_config.c +++ b/media/libvpx/config/win/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/x64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h index 068c6d2a99..448f13e4a1 100644 --- a/media/libvpx/config/win/x64/vpx_config.h +++ b/media/libvpx/config/win/x64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh index ef9bc696f3..4efcb54aa1 100755 --- a/media/libvpx/generate_sources_mozbuild.sh +++ b/media/libvpx/generate_sources_mozbuild.sh @@ -169,7 +169,8 @@ function gen_rtcd_header { # $1 - Header file directory. # $2 - Config command line. 
function gen_config_files { - ./configure $2 > /dev/null + ./configure $2 --log=$BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log > /dev/null + echo "Log file: $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log" # Disable HAVE_UNISTD_H. ( echo '/HAVE_UNISTD_H'; echo 'd' ; echo 'w' ; echo 'q' ) | ed -s vpx_config.h @@ -203,6 +204,7 @@ all_platforms="${all_platforms} --disable-avx512" x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm" arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only" arm64_platforms="--enable-realtime-only" +disable_sve="--disable-sve" # Bug 1885585 gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}" gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}" @@ -213,7 +215,7 @@ gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platfor gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}" gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}" -gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms}" +gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms} ${disable_sve}" # Bug 1885585 gen_config_files generic "--target=generic-gnu ${all_platforms}" @@ -236,7 +238,7 @@ gen_rtcd_header win/ia32 x86 gen_rtcd_header linux/arm armv7 gen_rtcd_header linux/arm64 arm64 -gen_rtcd_header win/aarch64 arm64 +gen_rtcd_header win/aarch64 arm64 $disable_sve # Bug 1885585 gen_rtcd_header generic generic @@ -275,6 +277,7 @@ config=$(print_config linux/arm64) make_clean make libvpx_srcs.txt target=libs $config > /dev/null convert_srcs_to_project_files libvpx_srcs.txt ARM64 +# Bug 1885585: The sve files will be excluded from the win/aarch64 build in moz.build. echo "Generate generic source list." 
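The gen_config_files change above writes each platform's config.log beside its generated headers, and the full configure invocation (including --disable-sve for win/aarch64, bug 1885585) is also baked verbatim into the cfg string of each regenerated vpx_config.c shown earlier. A minimal C sketch of how that baked-in string can be inspected at run time; both functions are part of the public libvpx API declared in vpx/vpx_codec.h:

#include <stdio.h>
#include "vpx/vpx_codec.h"

int main(void) {
  /* vpx_codec_build_config() returns the same `cfg` string that the
   * regenerated vpx_config.c files above carry, so flags such as
   * --disable-sve can be confirmed against the linked library. */
  printf("libvpx %s\n", vpx_codec_version_str());
  printf("configured with: %s\n", vpx_codec_build_config());
  return 0;
}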
config=$(print_config generic) diff --git a/media/libvpx/input_frame_validation.patch b/media/libvpx/input_frame_validation.patch index 1cb33e192f..37f755e022 100644 --- a/media/libvpx/input_frame_validation.patch +++ b/media/libvpx/input_frame_validation.patch @@ -8,15 +8,15 @@ MozReview-Commit-ID: BxDCnJe0mzs diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c -@@ -921,20 +921,29 @@ static vpx_codec_err_t vp8e_encode(vpx_c - dst_time_stamp = - pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - dst_end_time_stamp = (pts_val + (int64_t)duration) * - ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; +@@ -989,20 +989,29 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } + dst_end_time_stamp = + pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - if (img != NULL) { res = image2yuvconfig(img, &sd); - + - if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd, - dst_time_stamp, dst_end_time_stamp)) { - VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap index bb0ddd95b2..7206b5ebec 100644 --- a/media/libvpx/libvpx/.mailmap +++ b/media/libvpx/libvpx/.mailmap @@ -20,6 +20,7 @@ Hui Su <huisu@google.com> Jacky Chen <jackychen@google.com> Jim Bankoski <jimbankoski@google.com> Johann Koenig <johannkoenig@google.com> +Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com> Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org> Johann <johann@duck.com> <johann.koenig@gmail.com> @@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> Venkatarama NG. 
Avadhani <venkatarama.avadhani@ittiam.com> Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org> -xiwei gu <guxiwei-hf@loongson.cn> +Xiwei Gu <guxiwei-hf@loongson.cn> diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS index 2db4a113e4..5515e26589 100644 --- a/media/libvpx/libvpx/AUTHORS +++ b/media/libvpx/libvpx/AUTHORS @@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com> Angie Chen <yunqi@google.com> Angie Chiang <angiebird@google.com> Anton Venema <anton.venema@liveswitch.com> +Anupam Pandey <anupam.pandey@ittiam.com> Aron Rosenberg <arosenberg@logitech.com> Attila Nagy <attilanagy@google.com> Birk Magnussen <birk.magnussen@googlemail.com> @@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> Cheng Chen <chengchen@google.com> +Chen Wang <wangchen20@iscas.ac.cn> +Cherma Rajan A <cherma.rajan@ittiam.com> Chi Yo Tsai <chiyotsai@google.com> chm <chm@rock-chips.com> Chris Cunningham <chcunningham@chromium.org> @@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com> Fyodor Kyslov <kyslov@google.com> Gabriel Marin <gmx@chromium.org> Gaute Strokkenes <gaute.strokkenes@broadcom.com> +George Steed <george.steed@arm.com> +Gerda Zsejke More <gerdazsejke.more@arm.com> Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> @@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn> Jingning Han <jingning@google.com> Joel Fernandes <joelaf@google.com> Joey Parrish <joeyparrish@google.com> +Johann <johann@duck.com> Johann Koenig <johannkoenig@google.com> John Koleszar <jkoleszar@google.com> Johnny Klonaris <google@jawknee.com> @@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net> Konstantinos Margaritis <konma@vectorcamp.gr> Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> +L. E. Segovia <amy@amyspark.me> Linfeng Zhang <linfengz@google.com> Liu Peng <pengliu.mail@gmail.com> Lou Quillio <louquillio@google.com> @@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com> Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. Egge <negge@mozilla.com> +Neeraj Gadgil <neeraj.gadgil@ittiam.com> Neil Birkbeck <neil.birkbeck@gmail.com> Nico Weber <thakis@chromium.org> Niveditha Rau <niveditha.rau@gmail.com> @@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org> Vlad Tsyrklevich <vtsyrklevich@chromium.org> Wan-Teh Chang <wtc@google.com> Wonkap Jang <wonkap@google.com> -xiwei gu <guxiwei-hf@loongson.cn> +Xiahong Bao <xiahong.bao@nxp.com> +Xiwei Gu <guxiwei-hf@loongson.cn> Yaowu Xu <yaowu@google.com> Yi Luo <luoyi@google.com> Yongzhe Wang <yongzhe@google.com> diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG index 21070785ed..87f0d7f708 100644 --- a/media/libvpx/libvpx/CHANGELOG +++ b/media/libvpx/libvpx/CHANGELOG @@ -1,7 +1,79 @@ -20yy-mm-dd v1.14.0 "V Duck" +2024-01-02 v1.14.0 "Venetian Duck" This release drops support for old C compilers, such as Visual Studio 2012 and older, that disallow mixing variable declarations and statements (a C99 - feature). + feature). It adds support for run-time CPU feature detection for Arm + platforms, as well as support for darwin23 (macOS 14). + + - Upgrading: + This release is ABI incompatible with the previous release. + + Various new features for rate control library for real-time: SVC parallel + encoding, loopfilter level, support for frame dropping, and screen content. 
+ + New callback function send_tpl_gop_stats for vp9 external rate control + library, which can be used to transmit TPL stats for a group of pictures. A + public header vpx_tpl.h is added for the definition of TPL stats used in + this callback. + + libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c. + + - Enhancement: + Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8, + 68%-151% speed up for high bitdepth. + + Improvements on AVX2 and SSE optimizations. + Improvements on LSX optimizations for LoongArch. + 42-49% speedup on speed 0 VoD encoding. + Android API level predicates. + + - Bug fixes: + Fix to missing prototypes from the rtcd header. + Fix to segfault when total size is enlarged but width is smaller. + Fix to the build for arm64ec using MSVC. + Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic. + Fix to -Wshadow warnings. + Fix to heap overflow in vpx_get4x4sse_cs_neon. + Fix to buffer overrun in highbd Neon subpel variance filters. + Added bitexact encode test script. + Fix to -Wl,-z,defs with Clang's sanitizers. + Fix to decoder stability after error & continued decoding. + Fix to mismatch of VP9 encode with NEON intrinsics with C only version. + Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon. + Fix to fragments count before use. + Fix to a case where target bandwidth is 0 for SVC. + Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob. + Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr. + Fix to integer overflow in vp8,ratectrl.c. + Fix to integer overflow in vp9 svc. + Fix to avg_frame_bandwidth overflow. + Fix to per frame qp for temporal layers. + Fix to unsigned integer overflow in sse computation. + Fix to uninitialized mesh feature for BEST mode. + Fix to overflow in highbd temporal_filter. + Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon. + Skip arm64_neon.h workaround w/VS >= 2019. + Fix to c vs avx mismatch of diamond_search_sad(). + Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function. + Fix to a bug in vpx_hadamard_32x32_neon(). + Fix to Clang -Wunreachable-code-aggressive warnings. + Fix to a bug in vpx_highbd_hadamard_32x32_neon(). + Fix to -Wunreachable-code in mfqe_partition. + Force mode search on 64x64 if no mode is selected. + Fix to ubsan failure caused by left shift of negative. + Fix to integer overflow in calc_pframe_target_size. + Fix to float-cast-overflow in vp8_change_config(). + Fix to a null ptr before use. + Conditionally skip using inter frames in speed features. + Remove invalid reference frames. + Disable intra mode search speed features conditionally. + Set nonrd keyframe under dynamic change of deadline for rtc. + Fix to scaled reference offsets. + Set skip_recode=0 in nonrd_pick_sb_modes. + Fix to an edge case when downsizing to one. + Fix to a bug in frame scaling. + Fix to pred buffer stride. + Fix to a bug in simple motion search. + Update frame size in actual encoding. 2023-09-29 v1.13.1 "Ugly Duckling" This release contains two security related fixes. One each for VP8 and VP9. diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README index 4c25b15d81..6dbd164c34 100644 --- a/media/libvpx/libvpx/README +++ b/media/libvpx/libvpx/README @@ -1,5 +1,3 @@ -v1.13.1 Ugly Duckling - Welcome to the WebM VP8/VP9 Codec SDK! 
COMPILING THE APPLICATIONS/LIBRARIES: @@ -183,6 +181,44 @@ CODE STYLE: See also: http://clang.llvm.org/docs/ClangFormat.html +PROFILE GUIDED OPTIMIZATION (PGO) + Profile Guided Optimization can be enabled for Clang builds using the + commands: + + $ export CC=clang + $ export CXX=clang++ + $ ../libvpx/configure --enable-profile + $ make + + Generate one or multiple PGO profile files by running vpxdec or vpxenc. For + example: + + $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \ + -o - > /dev/null + + To convert and merge the raw profile files, use the llvm-profdata tool: + + $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw + + Then, rebuild the project with the new profile file: + + $ make clean + $ ../libvpx/configure --use-profile=perf.profdata + $ make + + Note: Always use the llvm-profdata from the toolchain that is used for + compiling the PGO-enabled binary. + + To observe the improvements from a PGO-enabled build, enable and compare the + list of failed optimizations by using the -Rpass-missed compiler flag. For + example, to list the failed loop vectorizations: + + $ ../libvpx/configure --use-profile=perf.profdata \ + --extra-cflags=-Rpass-missed=loop-vectorize + + For guidance on utilizing PGO files to identify potential optimization + opportunities, see: tools/README.pgo.md + SUPPORT This library is an open source project supported by its community. Please email webm-discuss@webmproject.org for help. diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk index ba24f541b1..533f43c1c2 100644 --- a/media/libvpx/libvpx/build/make/Android.mk +++ b/media/libvpx/libvpx/build/make/Android.mk @@ -15,13 +15,9 @@ ifdef NDK_ROOT # In an Android project place a libvpx checkout in the jni directory. # Run the configure script from the jni directory. Base libvpx # encoder/decoder configuration will look similar to: -# ./libvpx/configure --target=armv7-android-gcc --disable-examples \ +# ./libvpx/configure --target=arm64-android-gcc --disable-examples \ # --enable-external-build # -# When targeting Android, realtime-only is enabled by default. This can -# be overridden by adding the command line flag: -# --disable-realtime-only -# # This will create .mk files that contain variables that contain the # source files to compile. # @@ -38,11 +34,14 @@ ifdef NDK_ROOT # but the resulting library *must* be run on devices supporting all of the # enabled extensions. They can be disabled individually with # --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512} -# --disable-neon[-asm] +# --disable-neon{, -asm, -neon-dotprod, -neon-i8mm} +# --disable-sve # --disable-{dspr2, msa} # -# Running ndk-build will build libvpx and include it in your project. +# Running ndk-build will build libvpx and include it in your project. Set +# APP_ABI to match the --target passed to configure: +# https://developer.android.com/ndk/guides/application_mk#app_abi. 
# CONFIG_DIR := $(LOCAL_PATH)/ diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile index 199ed78058..658b37617b 100644 --- a/media/libvpx/libvpx/build/make/Makefile +++ b/media/libvpx/libvpx/build/make/Makefile @@ -150,6 +150,8 @@ $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm $(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm $(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve $(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+sve2 +$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+sve2 # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh index 869793a296..009bf7db5c 100644 --- a/media/libvpx/libvpx/build/make/configure.sh +++ b/media/libvpx/libvpx/build/make/configure.sh @@ -74,6 +74,8 @@ Build options: --cpu=CPU optimize for a specific cpu rather than a family --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS] --extra-cxxflags=ECXXFLAGS add ECXXFLAGS to CXXFLAGS [$CXXFLAGS] + --use-profile=PROFILE_FILE + Use PROFILE_FILE for PGO ${toggle_extra_warnings} emit harmless warnings (always non-fatal) ${toggle_werror} treat warnings as errors, if possible (not available with all compilers) @@ -81,6 +83,7 @@ Build options: ${toggle_pic} turn on/off Position Independent Code ${toggle_ccache} turn on/off compiler cache ${toggle_debug} enable/disable debug mode + ${toggle_profile} enable/disable profiling ${toggle_gprof} enable/disable gprof profiling instrumentation ${toggle_gcov} enable/disable gcov coverage instrumentation ${toggle_thumb} enable/disable building arm assembly in thumb mode @@ -429,6 +432,26 @@ check_gcc_machine_options() { fi } +check_neon_sve_bridge_compiles() { + if enabled sve; then + check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF +#ifndef __ARM_NEON_SVE_BRIDGE +#error 1 +#endif +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> +EOF + compile_result=$? + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling sve: arm_neon_sve_bridge.h not supported by compiler" + log_echo " disabling sve2: arm_neon_sve_bridge.h not supported by compiler" + disable_feature sve + disable_feature sve2 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 " + fi + fi +} + check_gcc_avx512_compiles() { if disabled gcc; then return @@ -611,6 +634,9 @@ process_common_cmdline() { --extra-cxxflags=*) extra_cxxflags="${optval}" ;; + --use-profile=*) + pgo_file=${optval} + ;; --enable-?*|--disable-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` if is_in ${option} ${ARCH_EXT_LIST}; then @@ -951,7 +977,7 @@ EOF add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin2[0-2]-*) + *-darwin2[0-3]-*) add_cflags "-arch ${toolchain%%-*}" add_ldflags "-arch ${toolchain%%-*}" ;; @@ -980,36 +1006,18 @@ EOF case ${toolchain} in arm*) soft_enable runtime_cpu_detect - # Arm ISA extensions are treated as supersets. - case ${tgt_isa} in - arm64|armv8) - for ext in ${ARCH_EXT_LIST_AARCH64}; do - # Disable higher order extensions to simplify dependencies. - if [ "$disable_exts" = "yes" ]; then - if ! disabled $ext; then - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " - disable_feature $ext - fi - elif disabled $ext; then - disable_exts="yes" - else - soft_enable $ext - fi - done - ;; - armv7|armv7s) - soft_enable neon - # Only enable neon_asm when neon is also enabled. 
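For context on the check_neon_sve_bridge_compiles probe above: it only verifies that the compiler ships <arm_neon_sve_bridge.h> before SVE and SVE2 stay enabled. A hedged sketch of the NEON-to-SVE hand-off that header makes possible, using bridge intrinsics from the Arm ACLE; the helper itself is illustrative, not libvpx code, and would be built with the same -march=armv8.2-a+dotprod+i8mm+sve flag the probe uses:

#include <stdint.h>
#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>

int64_t sum_int32x4_via_sve(int32x4_t v) {
  /* Broadcast the 128-bit NEON register across an SVE register... */
  svint32_t sv = svdup_neonq_s32(v);
  /* ...then horizontally add just the first four lanes with an SVE
   * reduction, avoiding a round trip through memory. */
  return svaddv_s32(svptrue_pat_b32(SV_VL4), sv);
}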
- enabled neon && soft_enable neon_asm - # If someone tries to force it through, die. - if disabled neon && enabled neon_asm; then - die "Disabling neon while keeping neon-asm is not supported" - fi - ;; - esac - asm_conversion_cmd="cat" + if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then + soft_enable neon + # Only enable neon_asm when neon is also enabled. + enabled neon && soft_enable neon_asm + # If someone tries to force it through, die. + if disabled neon && enabled neon_asm; then + die "Disabling neon while keeping neon-asm is not supported" + fi + fi + asm_conversion_cmd="cat" case ${tgt_cc} in gcc) link_with_cc=gcc @@ -1228,6 +1236,38 @@ EOF fi ;; esac + + # AArch64 ISA extensions are treated as supersets. + if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then + aarch64_arch_flag_neon="arch=armv8-a" + aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod" + aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm" + aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve" + aarch64_arch_flag_sve2="arch=armv9-a+sve2" + for ext in ${ARCH_EXT_LIST_AARCH64}; do + if [ "$disable_exts" = "yes" ]; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + soft_disable $ext + else + # Check the compiler supports the -march flag for the extension. + # This needs to happen after toolchain/OS inspection so we handle + # $CROSS etc correctly when checking for flags, else these will + # always fail. + flag="$(eval echo \$"aarch64_arch_flag_${ext}")" + check_gcc_machine_option "${flag}" "${ext}" + if ! enabled $ext; then + # Disable higher order extensions to simplify dependencies. + disable_exts="yes" + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + soft_disable $ext + fi + fi + done + if enabled sve; then + check_neon_sve_bridge_compiles + fi + fi + ;; mips*) link_with_cc=gcc @@ -1484,6 +1524,14 @@ EOF ;; esac + # Enable PGO + if [ -n "${pgo_file}" ]; then + check_add_cflags -fprofile-use=${pgo_file} || \ + die "-fprofile-use is not supported by compiler" + check_add_ldflags -fprofile-use=${pgo_file} || \ + die "-fprofile-use is not supported by linker" + fi + # Try to enable CPU specific tuning if [ -n "${tune_cpu}" ]; then if [ -n "${tune_cflags}" ]; then @@ -1504,6 +1552,9 @@ EOF else check_add_cflags -DNDEBUG fi + enabled profile && + check_add_cflags -fprofile-generate && + check_add_ldflags -fprofile-generate enabled gprof && check_add_cflags -pg && check_add_ldflags -pg enabled gcov && diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl index 0b9e16738e..025238d678 100755 --- a/media/libvpx/libvpx/build/make/rtcd.pl +++ b/media/libvpx/libvpx/build/make/rtcd.pl @@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') { @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/); + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm; diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure index b212e0709d..97e78996e8 100755 --- a/media/libvpx/libvpx/configure +++ b/media/libvpx/libvpx/configure @@ -260,6 +260,7 @@ ARCH_EXT_LIST_AARCH64=" neon_dotprod neon_i8mm sve + sve2 " ARCH_EXT_LIST_X86=" @@ -376,6 +377,7 @@ CMDLINE_SELECT=" install_libs install_srcs debug + profile gprof gcov pic @@ -659,6 +661,7 @@ process_toolchain() { check_add_cflags -Wmissing-declarations check_add_cflags -Wmissing-prototypes check_add_cflags -Wshadow + check_add_cflags 
-Wstrict-prototypes check_add_cflags -Wuninitialized check_add_cflags -Wunreachable-code-aggressive check_add_cflags -Wunused @@ -677,6 +680,10 @@ process_toolchain() { # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + # Do not allow implicit vector type conversions on Clang builds (this + # is already the default on GCC builds). + check_add_cflags -flax-vector-conversions=none + # Quiet gcc 6 vs 7 abi warnings: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 if enabled arm; then diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c index 5fb63e1660..083bd2519d 100644 --- a/media/libvpx/libvpx/examples/resize_util.c +++ b/media/libvpx/libvpx/examples/resize_util.c @@ -20,7 +20,7 @@ static const char *exec_name = NULL; -static void usage() { +static void usage(void) { printf("Usage:\n"); printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ", exec_name); diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c index 998e4fb20d..4050c093cd 100644 --- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c @@ -1156,12 +1156,13 @@ int main(int argc, const char **argv) { #if CONFIG_VP9_DECODER && !SIMULCAST_MODE vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id); // Don't look for mismatch on top spatial and top temporal layers as they - // are non reference frames. + // are non reference frames. Don't look at frames whose top spatial layer + // is dropped. if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) && + cx_pkt->data.frame + .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] && !(layer_id.temporal_layer_id > 0 && - layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 && - cx_pkt->data.frame - .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) { + layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1)) { test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen); } #endif diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c index 1a0823153b..6e12d668b0 100644 --- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c +++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c @@ -60,7 +60,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> " "<frame> <limit(optional)>\n", diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template index 1ee442af3e..6d05162d00 100644 --- a/media/libvpx/libvpx/libs.doxy_template +++ b/media/libvpx/libvpx/libs.doxy_template @@ -1223,14 +1223,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, which results in a white background. -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). - -DOT_TRANSPARENT = YES - # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). 
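The usage() -> usage(void) fixes in resize_util.c and vp9cx_set_ref.c above pair with the -Wstrict-prototypes flag newly added to configure: in C (unlike C++) an empty parameter list is an old-style declaration that accepts any arguments. A small illustration of the distinction, with hypothetical names:

/* static void usage();     old style: not a prototype, so
 *                          -Wstrict-prototypes warns because any
 *                          argument list is accepted                */
static void usage(void);    /* prototype: explicitly takes no args  */

static void usage(void) { /* ... */ }

int main(void) {
  usage();
  return 0;
}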
This # makes dot run faster, but since only newer versions of dot (>1.8.10) diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk index ff1c569c3b..5964386710 100644 --- a/media/libvpx/libvpx/libs.mk +++ b/media/libvpx/libvpx/libs.mk @@ -313,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 8 +SO_VERSION_MAJOR := 9 SO_VERSION_MINOR := 0 -SO_VERSION_PATCH := 1 +SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib SHARED_LIB_SUF := .dylib diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py index 1c69740d2b..98ce7b1947 100644 --- a/media/libvpx/libvpx/test/android/get_files.py +++ b/media/libvpx/libvpx/test/android/get_files.py @@ -38,7 +38,7 @@ def get_file_sha(filename): buf = file.read(HASH_CHUNK) return sha_hash.hexdigest() except IOError: - print "Error reading " + filename + print("Error reading " + filename) # Downloads a file from a url, and then checks the sha against the passed # in sha @@ -67,7 +67,7 @@ try: getopt.getopt(sys.argv[1:], \ "u:i:o:", ["url=", "input_csv=", "output_dir="]) except: - print 'get_files.py -u <url> -i <input_csv> -o <output_dir>' + print('get_files.py -u <url> -i <input_csv> -o <output_dir>') sys.exit(2) for opt, arg in opts: @@ -79,7 +79,7 @@ for opt, arg in opts: local_resource_path = os.path.join(arg) if len(sys.argv) != 7: - print "Expects two paths and a url!" + print("Expects two paths and a url!") exit(1) if not os.path.isdir(local_resource_path): @@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb") # Our 'csv' file uses multiple spaces as a delimiter, python's # csv class only uses single character delimiters, so we convert them below -file_list_reader = csv.reader((re.sub(' +', ' ', line) \ +file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \ for line in file_list_csv), delimiter = ' ') file_shas = [] @@ -104,15 +104,16 @@ for row in file_list_reader: file_list_csv.close() # Download files, only if they don't already exist and have correct shas -for filename, sha in itertools.izip(file_names, file_shas): +for filename, sha in zip(file_names, file_shas): + filename = filename.lstrip('*') path = os.path.join(local_resource_path, filename) if os.path.isfile(path) \ and get_file_sha(path) == sha: - print path + ' exists, skipping' + print(path + ' exists, skipping') continue for retry in range(0, ftp_retries): - print "Downloading " + path + print("Downloading " + path) if not download_and_check_sha(url, filename, sha): - print "Sha does not match, retrying..." 
+ print("Sha does not match, retrying...") else: break diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc index ede9c0ba8c..7816912ff7 100644 --- a/media/libvpx/libvpx/test/avg_test.cc +++ b/media/libvpx/libvpx/test/avg_test.cc @@ -719,6 +719,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(1024, &vp9_block_error_fp_neon))); #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve), + make_tuple(64, &vp9_block_error_fp_sve), + make_tuple(256, &vp9_block_error_fp_sve), + make_tuple(1024, &vp9_block_error_fp_sve))); +#endif // HAVE_SVE + #if HAVE_MSA INSTANTIATE_TEST_SUITE_P( MSA, AverageTest, diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h index c7e8f54847..179ccdf011 100644 --- a/media/libvpx/libvpx/test/codec_factory.h +++ b/media/libvpx/libvpx/test/codec_factory.h @@ -164,7 +164,9 @@ const libvpx_test::VP8CodecFactory kVP8; &libvpx_test::kVP8)), \ __VA_ARGS__)) #else -#define VP8_INSTANTIATE_TEST_SUITE(test, ...) +// static_assert() is used to avoid warnings about an extra ';' outside of a +// function. +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "") #endif // CONFIG_VP8 /* @@ -259,7 +261,9 @@ const libvpx_test::VP9CodecFactory kVP9; &libvpx_test::kVP9)), \ __VA_ARGS__)) #else -#define VP9_INSTANTIATE_TEST_SUITE(test, ...) +// static_assert() is used to avoid warnings about an extra ';' outside of a +// function. +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "") #endif // CONFIG_VP9 } // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc index ffd5c41c63..11f7625137 100644 --- a/media/libvpx/libvpx/test/convolve_test.cc +++ b/media/libvpx/libvpx/test/convolve_test.cc @@ -1218,6 +1218,24 @@ WRAP(convolve8_neon, 12) WRAP(convolve8_avg_neon, 12) #endif // HAVE_NEON +#if HAVE_SVE +WRAP(convolve8_horiz_sve, 8) +WRAP(convolve8_avg_horiz_sve, 8) +WRAP(convolve8_horiz_sve, 10) +WRAP(convolve8_avg_horiz_sve, 10) +WRAP(convolve8_horiz_sve, 12) +WRAP(convolve8_avg_horiz_sve, 12) +#endif // HAVE_SVE + +#if HAVE_SVE2 +WRAP(convolve8_vert_sve2, 8) +WRAP(convolve8_avg_vert_sve2, 8) +WRAP(convolve8_vert_sve2, 10) +WRAP(convolve8_avg_vert_sve2, 10) +WRAP(convolve8_vert_sve2, 12) +WRAP(convolve8_avg_vert_sve2, 12) +#endif // HAVE_SVE2 + WRAP(convolve_copy_c, 8) WRAP(convolve_avg_c, 8) WRAP(convolve8_horiz_c, 8) @@ -1438,6 +1456,74 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_neon_dotprod)); #endif // HAVE_NEON_DOTPROD +#if HAVE_SVE +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sve( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8, + wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_sve( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, + wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, 
wrap_convolve8_avg_c_10, + 10); +const ConvolveFunctions convolve12_sve( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, + wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); + +const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve), + ALL_SIZES(convolve10_sve), + ALL_SIZES(convolve12_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sve)); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE + +#if HAVE_SVE2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sve2( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8, + wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_sve2( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10, + wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + 10); +const ConvolveFunctions convolve12_sve2( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12, + wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); + +const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2), + ALL_SIZES(convolve10_sve2), + ALL_SIZES(convolve12_sve2) }; +INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sve2)); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE2 + #if HAVE_NEON_I8MM const ConvolveFunctions convolve8_neon_i8mm( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm, diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc index 508083673a..ca3b17a5d5 100644 --- a/media/libvpx/libvpx/test/encode_api_test.cc +++ b/media/libvpx/libvpx/test/encode_api_test.cc @@ -8,7 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. 
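The codec_factory.h hunk above gives the disabled-codec macros a static_assert body so the semicolon at each expansion site attaches to a real declaration instead of triggering extra-';' warnings. The same idiom sketched in C11 terms (macro and arguments are illustrative):

/* When the codec is compiled out, expand to a harmless declaration
 * that consumes the trailing semicolon at the call site. */
#define VP8_INSTANTIATE_TEST_SUITE(test, ...) \
  _Static_assert(1, "VP8 disabled; suite not instantiated")

VP8_INSTANTIATE_TEST_SUITE(SomeTest, 0); /* a no-op declaration */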
*/ +#include <cassert> #include <climits> +#include <cstdint> #include <cstring> #include <initializer_list> #include <new> @@ -44,6 +46,49 @@ bool IsVP9(vpx_codec_iface_t *iface) { 0; } +void *Memset16(void *dest, int val, size_t length) { + uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest); + for (size_t i = 0; i < length; i++) { + *dest16++ = val; + } + return dest; +} + +vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt, + unsigned int width, unsigned int height) { + assert(fmt != VPX_IMG_FMT_NV12); + if (bit_depth > VPX_BITS_8) { + fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH); + } + vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1); + if (!image) return image; + + const int val = 1 << (bit_depth - 1); + const unsigned int uv_h = + (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift; + const unsigned int uv_w = + (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift; + if (bit_depth > VPX_BITS_8) { + for (unsigned int i = 0; i < image->d_h; ++i) { + Memset16(image->planes[0] + i * image->stride[0], val, image->d_w); + } + for (unsigned int i = 0; i < uv_h; ++i) { + Memset16(image->planes[1] + i * image->stride[1], val, uv_w); + Memset16(image->planes[2] + i * image->stride[2], val, uv_w); + } + } else { + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], val, image->d_w); + } + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], val, uv_w); + memset(image->planes[2] + i * image->stride[2], val, uv_w); + } + } + + return image; +} + TEST(EncodeAPI, InvalidParams) { uint8_t buf[1] = { 0 }; vpx_image_t img; @@ -198,7 +243,51 @@ TEST(EncodeAPI, RandomPixelsVp8) { ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); // Generate random frame data and encode - uint8_t img[1280 * 720 * 3 / 2]; + libvpx_test::RandomVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.SetImageFormat(VPX_IMG_FMT_I420); + video.Begin(); + ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} + +TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_threads = 1; + cfg.g_profile = 0; + cfg.g_w = 1; + cfg.g_h = 64; + cfg.g_bit_depth = VPX_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_dropframe_thresh = 0; // Don't drop frames + cfg.rc_resize_allowed = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.rc_target_bitrate = 10; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = 10000; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK); + + // Generate random frame data and encode + uint8_t img[1 * 64 * 3 / 2]; libvpx_test::ACMRandom rng; for (size_t i = 0; i < sizeof(img); ++i) { img[i] = rng.Rand8(); @@ -207,13 +296,142 @@ TEST(EncodeAPI, RandomPixelsVp8) { ASSERT_EQ( vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img), &img_wrapper); - ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, 
VPX_DL_BEST_QUALITY), + vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF; + ASSERT_EQ( + vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 4294967; + // Set the scalability mode to L1T3. + cfg.ts_number_layers = 3; + cfg.ts_periodicity = 4; + cfg.ts_layer_id[0] = 0; + cfg.ts_layer_id[1] = 2; + cfg.ts_layer_id[2] = 1; + cfg.ts_layer_id[3] = 2; + cfg.ts_rate_decimator[0] = 4; + cfg.ts_rate_decimator[1] = 2; + cfg.ts_rate_decimator[2] = 1; + // Bitrate allocation L0: 50% L1: 20% L2: 30% + cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] = + 50 * cfg.rc_target_bitrate / 100; + cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] = + 70 * cfg.rc_target_bitrate / 100; + cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] = + cfg.rc_target_bitrate; + cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2), VPX_CODEC_OK); + constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: only reference last frame, no updates + // It only depends on layer 0 + flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF; + ASSERT_EQ( + vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + // Destroy libvpx encoder vpx_codec_destroy(&enc); } -#endif + +// Emulates the WebCodecs VideoEncoder interface. +class VP8Encoder { + public: + explicit VP8Encoder(int speed) : speed_(speed) {} + ~VP8Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, vpx_enc_deadline_t deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + vpx_enc_deadline_t deadline_ = 0; +}; + +VP8Encoder::~VP8Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP8Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + vpx_enc_deadline_t deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP8Encoder::Encode(bool key_frame) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t flags = key_frame ? 
VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_), + VPX_CODEC_OK); + ++frame_index_; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (key_frame) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } + } + vpx_img_free(image); +} + +// This is the reproducer testcase for crbug.com/324459561. However, +// just running this test is not enough to reproduce the bug. We also +// need to send signals to the test. +TEST(EncodeAPI, Chromium324459561) { + VP8Encoder encoder(-12); + + encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME); + + encoder.Encode(true); + encoder.Encode(true); + encoder.Encode(true); + + encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME); +} +#endif // CONFIG_VP8_ENCODER // Set up 2 spatial streams with 2 temporal layers per stream, and generate // invalid configuration by setting the temporal layer rate allocation @@ -499,6 +717,131 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } } +TEST(EncodeAPI, ConfigResizeBiggerAfterInit) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + vpx_codec_ctx_t enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg)); + + cfg.g_w = 1920; + cfg.g_h = 1; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + vpx_codec_ctx_t enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg)); + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc)); + + cfg.g_w = 1920; + cfg.g_h = 1; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + cfg.g_w = 1920; + cfg.g_h = 1080; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PtsSmallerThanInitialPts) { + for (const auto *iface : kCodecIfaces) { + // Initialize libvpx encoder. + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + // pts (10) is smaller than the initial pts (12). + ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PtsOrDurationTooBig) { + for (const auto *iface : kCodecIfaces) { + // Initialize libvpx encoder. 
+ vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); +#if ULONG_MAX > INT64_MAX + // duration is too big. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2), + VPX_CODEC_INVALID_PARAM); +#endif + // pts, when converted to ticks, is too big. + ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#if ULONG_MAX > INT64_MAX + // duration is too big. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + // pts + duration is too big. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#endif + // pts + duration, when converted to ticks, is too big. +#if ULONG_MAX > INT64_MAX + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#endif + ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + #if CONFIG_VP9_ENCODER // Frame size needed to trigger the overflow exceeds the max buffer allowed on // 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY @@ -528,28 +871,16 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { } #endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 -vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { - vpx_image_t *image = - vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); - if (!image) return image; - - for (unsigned int i = 0; i < image->d_h; ++i) { - memset(image->planes[0] + i * image->stride[0], 128, image->d_w); - } - const unsigned int uv_h = (image->d_h + 1) / 2; - const unsigned int uv_w = (image->d_w + 1) / 2; - for (unsigned int i = 0; i < uv_h; ++i) { - memset(image->planes[1] + i * image->stride[1], 128, uv_w); - memset(image->planes[2] + i * image->stride[2], 128, uv_w); - } - - return image; -} - // Emulates the WebCodecs VideoEncoder interface. class VP9Encoder { public: - explicit VP9Encoder(int speed) : speed_(speed) {} + explicit VP9Encoder(int speed) + : speed_(speed), bit_depth_(VPX_BITS_8), fmt_(VPX_IMG_FMT_I420) {} + // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set. + // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before + // passing the image format to vpx_img_alloc(). 
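The g_profile arithmetic introduced in VP9Encoder::Configure below follows directly from the rules quoted in its comment: profiles 0 and 1 are 8-bit, 2 and 3 are 10/12-bit; profiles 0 and 2 are 4:2:0-only. Restated as a standalone helper with a hypothetical name, using the same formula as the hunk:

#include "vpx/vpx_codec.h"
#include "vpx/vpx_image.h"

static unsigned int vp9_profile_for(vpx_bit_depth_t bit_depth,
                                    vpx_img_fmt_t fmt) {
  const int high_bit_depth = bit_depth > VPX_BITS_8; /* 10- or 12-bit */
  const int is_420 = fmt == VPX_IMG_FMT_I420;        /* 4:2:0 only    */
  return 2 * high_bit_depth + !is_420;               /* profile 0..3  */
}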
+ VP9Encoder(int speed, vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt) + : speed_(speed), bit_depth_(bit_depth), fmt_(fmt) {} ~VP9Encoder(); void Configure(unsigned int threads, unsigned int width, unsigned int height, @@ -558,6 +889,8 @@ class VP9Encoder { private: const int speed_; + const vpx_bit_depth_t bit_depth_; + const vpx_img_fmt_t fmt_; bool initialized_ = false; vpx_codec_enc_cfg_t cfg_; vpx_codec_ctx_t enc_; @@ -577,12 +910,22 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width, deadline_ = deadline; if (!initialized_) { + ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0); + const bool high_bit_depth = bit_depth_ > VPX_BITS_8; + const bool is_420 = fmt_ == VPX_IMG_FMT_I420; vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), VPX_CODEC_OK); cfg_.g_threads = threads; + // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3, + // all other subsampling formats are allowed. In profiles 0 and 1, only bit + // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are + // allowed. + cfg_.g_profile = 2 * high_bit_depth + !is_420; cfg_.g_w = width; cfg_.g_h = height; + cfg_.g_bit_depth = bit_depth_; + cfg_.g_input_bit_depth = bit_depth_; cfg_.g_timebase.num = 1; cfg_.g_timebase.den = 1000 * 1000; // microseconds cfg_.g_pass = VPX_RC_ONE_PASS; @@ -590,7 +933,10 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width, cfg_.rc_end_usage = end_usage; cfg_.rc_min_quantizer = 2; cfg_.rc_max_quantizer = 58; - ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ( + vpx_codec_enc_init(&enc_, iface, &cfg_, + high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0), + VPX_CODEC_OK); ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); initialized_ = true; return; @@ -606,13 +952,13 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width, void VP9Encoder::Encode(bool key_frame) { const vpx_codec_cx_pkt_t *pkt; - vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); + vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h); ASSERT_NE(image, nullptr); const vpx_enc_frame_flags_t frame_flags = key_frame ? 
VPX_EFLAG_FORCE_KF : 0; ASSERT_EQ( vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), VPX_CODEC_OK); - frame_index_++; + ++frame_index_; vpx_codec_iter_t iter = nullptr; while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); @@ -944,6 +1290,28 @@ TEST(EncodeAPI, Buganizer311294795) { encoder.Encode(false); encoder.Encode(false); } + +TEST(EncodeAPI, Buganizer317105128) { + VP9Encoder encoder(-9); + encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME); +} + +TEST(EncodeAPI, Buganizer319964497) { + VP9Encoder encoder(7); + encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} + #endif // CONFIG_VP9_ENCODER } // namespace diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc index eea5647a78..6306e4f2ca 100644 --- a/media/libvpx/libvpx/test/frame_size_tests.cc +++ b/media/libvpx/libvpx/test/frame_size_tests.cc @@ -193,7 +193,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) { // size or almost 1 gig of memory. // In total the allocations will exceed 2GiB which may cause a failure with // mingw + wine, use a smaller size in that case. -#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__) +#if defined(_WIN32) && !defined(_WIN64) video.SetSize(4096, 3072); #else video.SetSize(4096, 4096); diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc index f66f00b5c1..353c5043eb 100644 --- a/media/libvpx/libvpx/test/init_vpx_test.cc +++ b/media/libvpx/libvpx/test/init_vpx_test.cc @@ -57,6 +57,9 @@ void init_vpx_test() { if (!(caps & HAS_SVE)) { append_negative_gtest_filter(":SVE.*:SVE/*"); } + if (!(caps & HAS_SVE2)) { + append_negative_gtest_filter(":SVE2.*:SVE2/*"); + } #elif VPX_ARCH_ARM const int caps = arm_cpu_caps(); if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc index 20ad2229b4..f27bd7ebbc 100644 --- a/media/libvpx/libvpx/test/resize_test.cc +++ b/media/libvpx/libvpx/test/resize_test.cc @@ -7,8 +7,6 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <stdio.h> - #include <climits> #include <vector> #include "third_party/googletest/src/include/gtest/gtest.h" @@ -598,6 +596,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { mismatch_nframes_ = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER unsigned int last_w = cfg_.g_w; unsigned int last_h = cfg_.g_h; int resize_count = 0; @@ -613,12 +612,12 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { } } -#if CONFIG_VP9_DECODER // Verify that we get 1 resize down event in this test. 
ASSERT_EQ(1, resize_count) << "Resizing should occur."; EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); #else - printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); + GTEST_SKIP() + << "Warning: VP9 decoder unavailable, unable to check resize count!\n"; #endif } @@ -669,7 +668,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { ASSERT_EQ(resize_count, 4) << "Resizing should occur twice."; EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); #else - printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); + GTEST_SKIP() + << "Warning: VP9 decoder unavailable, unable to check resize count!\n"; #endif } diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc index d3c76a34d2..57037f1e30 100644 --- a/media/libvpx/libvpx/test/sum_squares_test.cc +++ b/media/libvpx/libvpx/test/sum_squares_test.cc @@ -119,6 +119,13 @@ INSTANTIATE_TEST_SUITE_P( &vpx_sum_squares_2d_i16_neon))); #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_sve))); +#endif // HAVE_SVE + #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( SSE2, SumSquaresTest, diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc index b8320e9ceb..5cf6a5fb8e 100644 --- a/media/libvpx/libvpx/test/variance_test.cc +++ b/media/libvpx/libvpx/test/variance_test.cc @@ -29,6 +29,9 @@ namespace { typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); +typedef void (*GetVarianceFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int *sum); typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); using libvpx_test::ACMRandom; @@ -63,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) { * Our codebase calculates the "diff" value in the variance algorithm by * (src - ref). 
*/ -static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, - int l2h, int src_stride, int ref_stride, - uint32_t *sse_ptr, bool use_high_bit_depth_, - vpx_bit_depth_t bit_depth) { - int64_t se = 0; - uint64_t sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; +static void variance(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int w, int h, bool use_high_bit_depth_, + uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) { + int64_t se_long = 0; + uint64_t sse_long = 0; + for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - int diff; + int diff = 0; if (!use_high_bit_depth_) { diff = src[y * src_stride + x] - ref[y * ref_stride + x]; - se += diff; - sse += diff * diff; #if CONFIG_VP9_HIGHBITDEPTH } else { diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] - CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x]; - se += diff; - sse += diff * diff; #endif // CONFIG_VP9_HIGHBITDEPTH } + se_long += diff; + sse_long += diff * diff; } } - RoundHighBitDepth(bit_depth, &se, &sse); - *sse_ptr = static_cast<uint32_t>(sse); + + RoundHighBitDepth(bit_depth, &se_long, &sse_long); + + *sse = sse_long; + *se = se_long; +} + +static void get_variance_ref(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int l2w, + int l2h, bool use_high_bit_depth_, uint32_t *sse, + int *se, vpx_bit_depth_t bit_depth) { + const int w = 1 << l2w; + const int h = 1 << l2h; + int64_t se_long = 0; + uint64_t sse_long = 0; + + variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_, + &sse_long, &se_long, bit_depth); + + *sse = static_cast<uint32_t>(sse_long); + *se = static_cast<int>(se_long); +} + +static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, + int l2h, int src_stride, int ref_stride, + uint32_t *sse_ptr, bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { + const int w = 1 << l2w; + const int h = 1 << l2h; + int64_t se_long = 0; + uint64_t sse_long = 0; + + variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_, + &sse_long, &se_long, bit_depth); + + *sse_ptr = static_cast<uint32_t>(sse_long); return static_cast<uint32_t>( - sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h))); + sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h))); } /* The subpel reference functions differ from the codec version in one aspect: @@ -337,6 +370,9 @@ class MainTestClass void OneQuarterTest(); void SpeedTest(); + // GetVariance tests + void RefTestGetVar(); + // MSE/SSE tests void RefTestMse(); void RefTestSse(); @@ -493,6 +529,35 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() { } //////////////////////////////////////////////////////////////////////////////// +// Tests related to GetVariance. 
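The refactor above funnels both reference paths through a single variance() helper: it accumulates the signed error sum (se) and the squared error sum (sse), and variance_ref() then applies the identity Var = SSE - SE^2 / N, where N = 2^(l2w + l2h), so the division reduces to a shift. A minimal standalone sketch of that identity, illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>

// Variance of an N-pixel block, derived from the squared-error sum (sse)
// and the signed error sum (se): Var = sse - se * se / N, N = 2^(l2w + l2h).
static uint32_t block_variance(uint64_t sse, int64_t se, int l2w, int l2h) {
  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}

int main() {
  // A 4x4 block (N = 16) where every pixel diff is 2: se = 32, sse = 64.
  // The identity gives 64 - 1024 / 16 = 0, as expected for a constant diff.
  assert(block_variance(64, 32, 2, 2) == 0);
  return 0;
}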
+template <typename GetVarianceFunctionType> +void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + unsigned int sse1, sse2; + int sum1, sum2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK( + params_.func(src_, stride, ref_, stride, &sse1, &sum1)); + get_variance_ref(src_, stride, ref_, stride, params_.log2width, + params_.log2height, use_high_bit_depth(), &sse2, &sum2, + params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(sum1, sum2) << "Error at test index: " << i; + } +} + +//////////////////////////////////////////////////////////////////////////////// // Tests related to MSE / SSE. template <typename FunctionType> @@ -766,6 +831,7 @@ void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() { typedef MainTestClass<Get4x4SseFunc> VpxSseTest; typedef MainTestClass<vpx_variance_fn_t> VpxMseTest; typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest; +typedef MainTestClass<GetVarianceFunc> VpxGetVarianceTest; typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest; typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest; @@ -779,6 +845,7 @@ TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } @@ -818,6 +885,16 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); +typedef TestParams<GetVarianceFunc> GetVarianceParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c), + GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c), + GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c))); + typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams; INSTANTIATE_TEST_SUITE_P( C, VpxSubpelVarianceTest, @@ -856,6 +933,7 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest; +typedef MainTestClass<GetVarianceFunc> VpxHBDGetVarianceTest; typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest; typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxHBDSubpelAvgVarianceTest; @@ -865,6 +943,7 @@ TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); } TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } @@ -933,6 +1012,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8))); 
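Both get_variance_ref() above and the high-bit-depth instantiations that follow rely on RoundHighBitDepth() (defined earlier in this file, outside the hunks shown) to scale the accumulated se/sse back to an 8-bit scale before comparison. A sketch of what such a normalization typically looks like — the exact rounding offsets are an assumption, since the helper is not part of this diff:

#include <cstdint>

// Scale error sums accumulated at 10- or 12-bit depth down to 8-bit range.
// Per-pixel error grows by 2^(bd - 8), so se shrinks by that factor and
// sse by its square; the added constants round to nearest, not truncate.
static void round_high_bit_depth(int bit_depth, int64_t *se, uint64_t *sse) {
  switch (bit_depth) {
    case 12:
      *sse = (*sse + 128) >> 8;  // divide by (2^(12-8))^2 = 2^8
      *se = (*se + 8) >> 4;      // divide by 2^(12-8) = 2^4
      break;
    case 10:
      *sse = (*sse + 8) >> 4;    // divide by (2^(10-8))^2 = 2^4
      *se = (*se + 2) >> 2;      // divide by 2^(10-8) = 2^2
      break;
    default:
      break;  // 8-bit input needs no rescaling
  }
}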
INSTANTIATE_TEST_SUITE_P( + C, VpxHBDGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8))); + +INSTANTIATE_TEST_SUITE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), @@ -1119,6 +1207,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_variance4x4_sse2))); INSTANTIATE_TEST_SUITE_P( + SSE2, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2), + GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2), + GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2))); + +INSTANTIATE_TEST_SUITE_P( SSE2, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), @@ -1198,6 +1295,16 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8))); INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8))); + +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, @@ -1475,6 +1582,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_neon), VarianceParams(2, 2, &vpx_variance4x4_neon))); +INSTANTIATE_TEST_SUITE_P( + NEON, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon), + GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon), + GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon))); + #if HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxSseTest, @@ -1502,6 +1618,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod), + GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod), + GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod))); #endif // HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( @@ -1555,9 +1680,6 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); -// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can -// be used again. 
-#if 0 #if HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxHBDMseTest, @@ -1567,7 +1689,19 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); #endif // HAVE_NEON_DOTPROD -#endif // 0 + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDMseTest, + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10))); +#endif // HAVE_SVE INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, @@ -1613,6 +1747,28 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8))); + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8))); +#endif // HAVE_SVE + +INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, @@ -1815,6 +1971,53 @@ INSTANTIATE_TEST_SUITE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON +#if HAVE_SVE +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10), + VarianceParams(4, 4, 
&vpx_highbd_10_variance16x16_sve, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE + #if HAVE_MSA INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, ::testing::Values(vpx_get_mb_ss_msa)); @@ -1846,6 +2049,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_variance4x4_msa))); INSTANTIATE_TEST_SUITE_P( + MSA, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa), + GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa), + GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa))); + +INSTANTIATE_TEST_SUITE_P( MSA, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), @@ -1908,6 +2120,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(3, 2, &vpx_variance8x4_vsx), VarianceParams(2, 3, &vpx_variance4x8_vsx), VarianceParams(2, 2, &vpx_variance4x4_vsx))); + +INSTANTIATE_TEST_SUITE_P( + VSX, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx), + GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx), + GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx))); #endif // HAVE_VSX #if HAVE_MMI diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h index 2194126f1f..2c035910db 100644 --- a/media/libvpx/libvpx/test/video_source.h +++ b/media/libvpx/libvpx/test/video_source.h @@ -236,7 +236,6 @@ class RandomVideoSource : public DummyVideoSource { RandomVideoSource(int seed = ACMRandom::DeterministicSeed()) : rnd_(seed), seed_(seed) {} - protected: // Reset the RNG to get a matching stream for the second pass void Begin() override { frame_ = 0; @@ -244,6 +243,7 @@ class RandomVideoSource : public DummyVideoSource { FillFrame(); } + protected: // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. 
void FillFrame() override { diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc index aee27af66e..d47ed298fe 100644 --- a/media/libvpx/libvpx/test/vp8_datarate_test.cc +++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc @@ -14,7 +14,7 @@ #include "test/i420_video_source.h" #include "test/util.h" #include "test/y4m_video_source.h" -#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" namespace { @@ -260,6 +260,27 @@ class DatarateTestLarge << " The datarate for the file missed the target!"; } + virtual void MultiThreadsPSNRTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 4; + init_flags_ = VPX_CODEC_USE_PSNR; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, 30); + cfg_.rc_target_bitrate = 1000; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0) + << " The datarate for the file missed the target!"; + } + vpx_codec_pts_t last_pts_; int64_t bits_in_buffer_model_; double timebase_; @@ -324,6 +345,8 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { DropFramesMultiThreadsTest(); } +TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); } + TEST_P(DatarateTestRealTime, RegionOfInterest) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc index 50478f7635..d87fef5a46 100644 --- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc +++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc @@ -149,9 +149,16 @@ class Vp8RcInterfaceTest return; } int qp; + libvpx::UVDeltaQP uv_delta_qp; encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { ASSERT_EQ(rc_api_->GetQP(), qp); + uv_delta_qp = rc_api_->GetUVDeltaQP(); + // delta_qp for UV channel is only set for screen. 
+ if (!rc_cfg_.is_screen) { + ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0); + ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0); + } } else { num_drops_++; } diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc index 0645341ac1..c5ddcd58ab 100644 --- a/media/libvpx/libvpx/test/vp9_block_error_test.cc +++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc @@ -215,4 +215,13 @@ const BlockErrorParam neon_block_error_tests[] = { INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, ::testing::ValuesIn(neon_block_error_tests)); #endif // HAVE_NEON + +#if HAVE_SVE +const BlockErrorParam sve_block_error_tests[] = { make_tuple( + &BlockError8BitWrapper<vp9_block_error_sve>, + &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) }; + +INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest, + ::testing::ValuesIn(sve_block_error_tests)); +#endif // HAVE_SVE } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc index 33fa05c65c..5c23a5b0d5 100644 --- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc +++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc @@ -10,115 +10,78 @@ #include <cstdint> #include <new> +#include <memory> + +#include "./vpx_config.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#if CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif #include "vp9/simple_encode.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_image.h" #include "vpx/vpx_tpl.h" #include "vpx_dsp/vpx_dsp_common.h" namespace { -constexpr int kModelMagicNumber = 51396; -constexpr uintptr_t PrivMagicNumber = 5566; -constexpr int kFrameNum = 5; -constexpr int kFrameNumGOP = 30; -constexpr int kFrameNumGOPShort = 4; -constexpr int kLosslessCodingIndex = 2; -constexpr int kFixedGOPSize = 9; -// The range check in vp9_cx_iface.c shows that the max -// lag in buffer is MAX_LAG_BUFFERS (25): -// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); -constexpr int kMaxLagInFrames = 25; -constexpr int kDefaultMinGfInterval = 4; -constexpr int kDefaultMaxGfInterval = 16; -// The active gf interval might change for each GOP -// See function "get_active_gf_inverval_range". -// The numbers below are from manual inspection. -constexpr int kReadMinGfInterval = 5; -constexpr int kReadMaxGfInterval = 13; -const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.4; - -struct ToyRateCtrl { - int magic_number; - int coding_index; - - int gop_global_index; - int frames_since_key; - int show_index; +constexpr int kFrameNum = 10; +constexpr int kFixedGOPSize = 10; +constexpr int kKeyframeQp = 10; +constexpr int kLeafQp = 40; +constexpr int kArfQp = 15; + +// Simple external rate controller for testing. +class RateControllerForTest { + public: + RateControllerForTest() : current_gop_(-1) {} + ~RateControllerForTest() {} + + void StartNextGop() { ++current_gop_; } + + vpx_rc_gop_decision_t GetCurrentGop() const { + vpx_rc_gop_decision_t gop_decision; + gop_decision.use_key_frame = current_gop_ == 0 ? 1 : 0; + gop_decision.use_alt_ref = 1; + gop_decision.gop_coding_frames = kFixedGOPSize; + return gop_decision; + } + + int CalculateFrameDecision(int frame_index) { + EXPECT_LE(frame_index, kFixedGOPSize); + if (current_gop_ == 0 && frame_index == 0) { + // Key frame, first frame in the first GOP. 
+ return kKeyframeQp; + } else if (frame_index == 1) { + // ARF, we always use ARF for this test. + return kArfQp; + } else { + return kLeafQp; + } + } + int current_gop_; }; -vpx_rc_status_t rc_create_model(void *priv, - const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_ptr) { - ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; - toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->coding_index = -1; - *rate_ctrl_model_ptr = toy_rate_ctrl; - EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); - EXPECT_EQ(ratectrl_config->frame_width, 352); - EXPECT_EQ(ratectrl_config->frame_height, 288); - EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum); - EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); - EXPECT_EQ(ratectrl_config->frame_rate_num, 30); - EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_create_model_gop(void *priv, - const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_ptr) { - ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; - toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - toy_rate_ctrl->show_index = 0; - toy_rate_ctrl->coding_index = 0; - *rate_ctrl_model_ptr = toy_rate_ctrl; - EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); - EXPECT_EQ(ratectrl_config->frame_width, 640); - EXPECT_EQ(ratectrl_config->frame_height, 360); - EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP); - EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000); - EXPECT_EQ(ratectrl_config->frame_rate_num, 30); - EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_create_model_gop_short( - void *priv, const vpx_rc_config_t *ratectrl_config, +// Callbacks used in this test. 
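libvpx's external rate-control hooks are plain C callbacks, so they cannot hold the C++ controller directly; instead create_model stores a RateControllerForTest pointer in the opaque vpx_rc_model_t handle, and every later callback casts it back. A minimal sketch of that opaque-handle pattern, with hypothetical my_* names (the test's real callbacks follow right after this):

#include <new>

#include "vpx/vpx_ext_ratectrl.h"

// Opaque-handle pattern: the pointer written through create_model is handed
// back verbatim to every subsequent callback until delete_model frees it.
struct MyController {
  int current_gop = -1;
};

vpx_rc_status_t my_create_model(void * /*priv*/,
                                const vpx_rc_config_t * /*cfg*/,
                                vpx_rc_model_t *model) {
  *model = new (std::nothrow) MyController();  // retained by the encoder
  return *model != nullptr ? VPX_RC_OK : VPX_RC_ERROR;
}

vpx_rc_status_t my_delete_model(vpx_rc_model_t model) {
  delete static_cast<MyController *>(model);  // cast back before any use
  return VPX_RC_OK;
}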
+vpx_rc_status_t rc_test_create_model( + void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/, vpx_rc_model_t *rate_ctrl_model_ptr) { - ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; - toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - toy_rate_ctrl->show_index = 0; - toy_rate_ctrl->coding_index = 0; - *rate_ctrl_model_ptr = toy_rate_ctrl; - EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); - EXPECT_EQ(ratectrl_config->frame_width, 352); - EXPECT_EQ(ratectrl_config->frame_height, 288); - EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort); - EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500); - EXPECT_EQ(ratectrl_config->frame_rate_num, 30); - EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + std::unique_ptr<RateControllerForTest> test_controller( + new RateControllerForTest()); + *rate_ctrl_model_ptr = test_controller.release(); return VPX_RC_OK; } -vpx_rc_status_t rc_send_firstpass_stats( - vpx_rc_model_t rate_ctrl_model, +vpx_rc_status_t rc_test_send_firstpass_stats( + vpx_rc_model_t /*rate_ctrl_model*/, const vpx_rc_firstpass_stats_t *first_pass_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(first_pass_stats->num_frames, kFrameNum); for (int i = 0; i < first_pass_stats->num_frames; ++i) { EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); @@ -126,37 +89,8 @@ vpx_rc_status_t rc_send_firstpass_stats( return VPX_RC_OK; } -vpx_rc_status_t rc_send_firstpass_stats_gop( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_firstpass_stats_t *first_pass_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP); - for (int i = 0; i < first_pass_stats->num_frames; ++i) { - EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); - } - return VPX_RC_OK; -} - -vpx_rc_status_t rc_send_firstpass_stats_gop_short( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_firstpass_stats_t *first_pass_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort); - for (int i = 0; i < first_pass_stats->num_frames; ++i) { - EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); - } - return VPX_RC_OK; -} - -vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, - const VpxTplGopStats *tpl_gop_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); +vpx_rc_status_t rc_test_send_tpl_gop_stats( + vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) { EXPECT_GT(tpl_gop_stats->size, 0); for (int i = 0; i < tpl_gop_stats->size; ++i) { @@ -165,522 +99,38 @@ vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, return VPX_RC_OK; } -vpx_rc_status_t rc_get_encodeframe_decision( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, +vpx_rc_status_t rc_test_get_encodeframe_decision( + vpx_rc_model_t rate_ctrl_model, const int frame_gop_index, vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = 
static_cast<ToyRateCtrl *>(rate_ctrl_model); - toy_rate_ctrl->coding_index += 1; - - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - EXPECT_LT(encode_frame_info->show_index, kFrameNum); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 4); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - } else if (encode_frame_info->coding_index >= 2 && - encode_frame_info->coding_index < 5) { - // In the first group of pictures, coding_index and gop_index are equal. - EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - } else if (encode_frame_info->coding_index == 5) { - EXPECT_EQ(encode_frame_info->show_index, 4); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 1); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 1); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 4); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2], - 1); // kRefFrameTypeFuture - } - if (encode_frame_info->coding_index == kLosslessCodingIndex) { - // We should get sse == 0 at rc_update_encodeframe_result() - frame_decision->q_index = 0; - } else { - frame_decision->q_index = 100; - } - frame_decision->max_frame_size = 0; + RateControllerForTest *test_controller = + static_cast<RateControllerForTest *>(rate_ctrl_model); + frame_decision->q_index = + test_controller->CalculateFrameDecision(frame_gop_index); return VPX_RC_OK; } -vpx_rc_status_t rc_get_encodeframe_decision_gop( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - 
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - } else if (encode_frame_info->coding_index == 3 || - encode_frame_info->coding_index == 12 || - encode_frame_info->coding_index == 21) { - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); - EXPECT_EQ(encode_frame_info->gop_index, 1); - } else if (encode_frame_info->coding_index == 11 || - encode_frame_info->coding_index == 20 || - encode_frame_info->coding_index == 29) { - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(encode_frame_info->gop_index, 0); - } else if (encode_frame_info->coding_index >= 30) { - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. 
- frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_encodeframe_decision_gop_short( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 2); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 3) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. 
- frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 2); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 3) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 3); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 4) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. 
- frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 2); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 3) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 3); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. 
- frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, - const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval); - EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = - VPXMIN(kFixedGOPSize, gop_info->frames_to_key); - gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -// Test on a 4 frame video. -// Test a setting of 2 GOPs. -// The first GOP has 3 coding frames, no alt ref. -// The second GOP has 1 coding frame, no alt ref. -vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, - const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1; - gop_decision->use_alt_ref = 0; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -// Test on a 4 frame video. -// Test a setting of 2 GOPs. -// The first GOP has 4 coding frames. Use alt ref. 
-// The second GOP only contains the overlay frame of the first GOP's alt ref -// frame. -vpx_rc_status_t rc_get_gop_decision_short_overlay( - vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; - gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -// Test on a 4 frame video. -// Test a setting of 1 GOP. -// The GOP has 4 coding frames. Do not use alt ref. -vpx_rc_status_t rc_get_gop_decision_short_no_arf( - vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 
4 : 1; - gop_decision->use_alt_ref = 0; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_update_encodeframe_result( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_result_t *encode_frame_result) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - const int64_t ref_pixel_count = 352 * 288 * 3 / 2; - EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); - if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { - EXPECT_EQ(encode_frame_result->sse, 0); - } - if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { - EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0); - } else { - EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100); - } - return VPX_RC_OK; -} - -vpx_rc_status_t rc_update_encodeframe_result_gop( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_result_t *encode_frame_result) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - const int64_t ref_pixel_count = 640 * 360 * 3 / 2; - EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_update_encodeframe_result_gop_short( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_result_t *encode_frame_result) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - const int64_t ref_pixel_count = 352 * 288 * 3 / 2; - EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_default_frame_rdmult( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - *rdmult = VPX_DEFAULT_RDMULT; +vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + vpx_rc_gop_decision_t *gop_decision) { + RateControllerForTest *test_controller = + static_cast<RateControllerForTest *>(rate_ctrl_model); + test_controller->StartNextGop(); + *gop_decision = test_controller->GetCurrentGop(); return VPX_RC_OK; } vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - delete toy_rate_ctrl; + RateControllerForTest *test_controller = + static_cast<RateControllerForTest *>(rate_ctrl_model); + delete test_controller; return VPX_RC_OK; } class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: - ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {} + ExtRateCtrlTest() + : EncoderTest(&::libvpx_test::kVP9), frame_number_(0), + current_frame_qp_(0) {} ~ExtRateCtrlTest() override = default; @@ -693,287 +143,62 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs = {}; - 
rc_funcs.rc_type = VPX_RC_QP; - rc_funcs.create_model = rc_create_model; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision; - rc_funcs.update_encodeframe_result = rc_update_encodeframe_result; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); - } - } -}; - -TEST_F(ExtRateCtrlTest, EncodeTest) { - cfg_.rc_target_bitrate = 24000; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, - kFrameNum)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOP() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; - rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; - rc_funcs.get_gop_decision = rc_get_gop_decision; - rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); - } - } -}; - -TEST_F(ExtRateCtrlTestGOP, EncodeTest) { - cfg_.rc_target_bitrate = 4000; - cfg_.g_lag_in_frames = kMaxLagInFrames; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, - kFrameNumGOP)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOPShort() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; - rc_funcs.get_gop_decision = rc_get_gop_decision_short; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; - 
rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); - } - } -}; - -TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOPShortOverlay - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOPShortOverlay() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - - vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = - rc_get_encodeframe_decision_gop_short_overlay; - rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; + rc_funcs.create_model = rc_test_create_model; + rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats; + rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats; + rc_funcs.get_gop_decision = rc_test_get_gop_decision; + rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision; rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); } } -}; - -TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOPShortNoARF - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOPShortNoARF() override = default; - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); +#if CONFIG_VP9_DECODER + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const ::libvpx_test::VideoSource & /*video*/, + ::libvpx_test::Decoder *decoder) override { + EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_); + return VPX_CODEC_OK == res_dec; } - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); -
encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = - rc_get_encodeframe_decision_gop_short_no_arf; - rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (frame_number_ == 0) { + // This must be a key frame + EXPECT_TRUE((pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0); + EXPECT_EQ(current_frame_qp_, kKeyframeQp); + ++frame_number_; + return; } - } -}; - -TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, - public ::testing::Test { - protected: - ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestRdmult() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void BeginPassHook(unsigned int) override { - psnr_ = 0.0; - nframes_ = 0; - } - - void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { - psnr_ += pkt->data.psnr.psnr[0]; - nframes_++; - } - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; - rc_funcs.get_gop_decision = rc_get_gop_decision_short; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; - rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + if ((pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) != 0) { + // This is ARF + EXPECT_EQ(current_frame_qp_, kArfQp); + ++frame_number_; + return; } - } - double GetAveragePsnr() const { - if (nframes_) return psnr_ / nframes_; - return 0.0; + EXPECT_EQ(current_frame_qp_, kLeafQp); + ++frame_number_; } +#endif // CONFIG_VP9_DECODER - private: - double psnr_; - unsigned int nframes_; + int frame_number_; + int current_frame_qp_; }; -TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - init_flags_ = VPX_CODEC_USE_PSNR; +TEST_F(ExtRateCtrlTest, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = 25; std::unique_ptr<libvpx_test::VideoSource> video; video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - 
kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, + kFrameNum)); ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); - - const double psnr = GetAveragePsnr(); - EXPECT_GT(psnr, kPsnrThreshold); } } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc index f7be47542c..a6c7563348 100644 --- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc +++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc @@ -9,6 +9,7 @@ */ #include "vp9/ratectrl_rtc.h" +#include <climits> #include <fstream> // NOLINT #include <string> @@ -19,6 +20,8 @@ #include "test/i420_video_source.h" #include "test/util.h" #include "test/video_source.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_svc_layercontext.h" #include "vpx/vpx_codec.h" #include "vpx_ports/bitops.h" diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc index 049a10a617..a5a18a7e9d 100644 --- a/media/libvpx/libvpx/test/vp9_scale_test.cc +++ b/media/libvpx/libvpx/test/vp9_scale_test.cc @@ -48,12 +48,11 @@ class ScaleTest : public VpxScaleBase, } void RunTest(INTERP_FILTER filter_type) { - static const int kNumSizesToTest = 20; + static const int kNumSizesToTest = 22; static const int kNumScaleFactorsToTest = 4; - static const int kSizesToTest[] = { - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, - 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 - }; + static const int kSizesToTest[] = { 1, 2, 3, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 68, 128, 134 }; static const int kScaleFactors[] = { 1, 2, 3, 4 }; for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { for (int h = 0; h < kNumSizesToTest; ++h) { diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c index 5c13781513..5af971f720 100644 --- a/media/libvpx/libvpx/tools_common.c +++ b/media/libvpx/libvpx/tools_common.c @@ -26,15 +26,9 @@ #include "vpx/vpx_codec.h" -#if defined(_WIN32) || defined(__OS2__) +#if defined(_WIN32) #include <io.h> #include <fcntl.h> - -#ifdef __OS2__ -#define _setmode setmode -#define _fileno fileno -#define _O_BINARY O_BINARY -#endif #endif #define LOG_ERROR(label) \ @@ -58,7 +52,7 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { FILE *set_binary_mode(FILE *stream) { (void)stream; -#if defined(_WIN32) || defined(__OS2__) +#if defined(_WIN32) _setmode(_fileno(stream), _O_BINARY); #endif return stream; @@ -96,9 +90,9 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { int w = vpx_img_plane_width(yuv_frame, plane); const int h = vpx_img_plane_height(yuv_frame, plane); int r; - // Assuming that for nv12 we read all chroma data at one time + // Assuming that for nv12 we read all chroma data at once if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; - // Fixing NV12 chroma width it is odd + // Fixing NV12 chroma width if it is odd if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; /* Determine the correct plane based on the image format. The for-loop * always counts in Y,U,V order, but this may not match the order of @@ -229,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) { void vpx_img_write(const vpx_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = vpx_img_plane_width(img, plane); const int h = vpx_img_plane_height(img, plane); int y; + // Assuming that for nv12 we write all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytespp, w, file); buf += stride; } } @@ -247,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) { int vpx_img_read(vpx_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = vpx_img_plane_width(img, plane); const int h = vpx_img_plane_height(img, plane); int y; + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) return 0; + if (fread(buf, bytespp, w, file) != (size_t)w) return 0; buf += stride; } } diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index ee3c281f0f..a54e81084b 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -16,7 +16,7 @@ #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ + { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c index fc4a3539fd..b9efc0cc1f 100644 --- a/media/libvpx/libvpx/vp8/common/entropy.c +++ b/media/libvpx/libvpx/vp8/common/entropy.c @@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177, p[0] = p[1] = 0; } - void init_bit_trees() { + void init_bit_trees(void) { init_bit_tree(cat1, 1); init_bit_tree(cat2, 2); init_bit_tree(cat3, 3); diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c index 71529bdfd8..7c8e083f4f 100644 --- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c +++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c @@ -25,23 +25,19 @@ #include "vp8/common/systemdependent.h" #if CONFIG_MULTITHREAD -#if HAVE_UNISTD_H && !defined(__OS2__) +#if HAVE_UNISTD_H #include <unistd.h> #elif defined(_WIN32) #include <windows.h> typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO); -#elif defined(__OS2__) -#define INCL_DOS -#define INCL_DOSSPINLOCK -#include <os2.h> #endif #endif #if CONFIG_MULTITHREAD -static int get_cpu_count() { +static int get_cpu_count(void) { int core_count = 16; -#if HAVE_UNISTD_H && !defined(__OS2__) +#if HAVE_UNISTD_H #if 
defined(_SC_NPROCESSORS_ONLN) core_count = (int)sysconf(_SC_NPROCESSORS_ONLN); #elif defined(_SC_NPROC_ONLN) @@ -49,38 +45,13 @@ static int get_cpu_count() { #endif #elif defined(_WIN32) { -#if _WIN32_WINNT >= 0x0501 +#if _WIN32_WINNT < 0x0501 +#error _WIN32_WINNT must target Windows XP or newer. +#endif SYSTEM_INFO sysinfo; GetNativeSystemInfo(&sysinfo); -#else - PGNSI pGNSI; - SYSTEM_INFO sysinfo; - - /* Call GetNativeSystemInfo if supported or - * GetSystemInfo otherwise. */ - - pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), - "GetNativeSystemInfo"); - if (pGNSI != NULL) - pGNSI(&sysinfo); - else - GetSystemInfo(&sysinfo); -#endif - core_count = (int)sysinfo.dwNumberOfProcessors; } -#elif defined(__OS2__) - { - ULONG proc_id; - ULONG status; - - core_count = 0; - for (proc_id = 1;; ++proc_id) { - if (DosGetProcessorStatus(proc_id, &status)) break; - - if (status == PROC_ONLINE) core_count++; - } - } #else /* other platforms */ #endif diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h index 1b70ea5dba..2038c000b0 100644 --- a/media/libvpx/libvpx/vp8/common/onyx.h +++ b/media/libvpx/libvpx/vp8/common/onyx.h @@ -242,7 +242,7 @@ typedef struct { #endif } VP8_CONFIG; -void vp8_initialize(); +void vp8_initialize(void); struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c index 09a0e2b4b3..102b7ccd54 100644 --- a/media/libvpx/libvpx/vp8/common/rtcd.c +++ b/media/libvpx/libvpx/vp8/common/rtcd.c @@ -12,4 +12,4 @@ #include "./vp8_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp8_rtcd() { once(setup_rtcd_internal); } +void vp8_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h index 1cfb9fec51..0de75cfde3 100644 --- a/media/libvpx/libvpx/vp8/common/threading.h +++ b/media/libvpx/libvpx/vp8/common/threading.h @@ -19,161 +19,57 @@ extern "C" { #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD -/* Thread management macros */ #if defined(_WIN32) && !HAVE_PTHREAD_H /* Win32 */ -#include <process.h> #include <windows.h> -#if defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define THREAD_FUNCTION \ - __attribute__((force_align_arg_pointer)) unsigned int __stdcall -#else -#define THREAD_FUNCTION unsigned int __stdcall -#endif -#define THREAD_FUNCTION_RETURN DWORD -#define THREAD_SPECIFIC_INDEX DWORD -#define pthread_t HANDLE -#define pthread_attr_t DWORD -#define pthread_detach(thread) \ - if (thread != NULL) CloseHandle(thread) -#define thread_sleep(nms) Sleep(nms) -#define pthread_cancel(thread) terminate_thread(thread, 0) -#define ts_key_create(ts_key, destructor) \ - { ts_key = TlsAlloc(); }; -#define pthread_getspecific(ts_key) TlsGetValue(ts_key) -#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value) -#define pthread_self() GetCurrentThreadId() - -#elif defined(__OS2__) -/* OS/2 */ -#define INCL_DOS -#include <os2.h> - -#include <stdlib.h> -#define THREAD_FUNCTION void * -#define THREAD_FUNCTION_RETURN void * -#define THREAD_SPECIFIC_INDEX PULONG -#define pthread_t TID -#define pthread_attr_t ULONG -#define pthread_detach(thread) 0 -#define thread_sleep(nms) DosSleep(nms) -#define pthread_cancel(thread) DosKillThread(thread) -#define ts_key_create(ts_key, destructor) \ - DosAllocThreadLocalMemory(1, &(ts_key)); -#define 
pthread_getspecific(ts_key) ((void *)(*(ts_key))) -#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value)) -#define pthread_self() _gettid() #else +/* pthreads */ #ifdef __APPLE__ #include <mach/mach_init.h> #include <mach/semaphore.h> #include <mach/task.h> #include <time.h> #include <unistd.h> - #else #include <semaphore.h> #endif - -#include <pthread.h> -/* pthreads */ -/* Nearly everything is already defined */ -#define THREAD_FUNCTION void * -#define THREAD_FUNCTION_RETURN void * -#define THREAD_SPECIFIC_INDEX pthread_key_t -#define ts_key_create(ts_key, destructor) \ - pthread_key_create(&(ts_key), destructor); #endif /* Synchronization macros: Win32 and Pthreads */ #if defined(_WIN32) && !HAVE_PTHREAD_H -#define sem_t HANDLE -#define pause(voidpara) __asm PAUSE -#define sem_init(sem, sem_attr1, sem_init_value) \ - (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL) -#define sem_wait(sem) \ +#define vp8_sem_t HANDLE +#define vp8_sem_init(sem, pshared, value) \ + (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL) +#define vp8_sem_wait(sem) \ (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE)) -#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) -#define sem_destroy(sem) \ +#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) +#define vp8_sem_destroy(sem) \ if (*sem) ((int)(CloseHandle(*sem)) == TRUE) #define thread_sleep(nms) Sleep(nms) -#elif defined(__OS2__) -typedef struct { - HEV event; - HMTX wait_mutex; - HMTX count_mutex; - int count; -} sem_t; - -static inline int sem_init(sem_t *sem, int pshared, unsigned int value) { - DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0, - value > 0 ? TRUE : FALSE); - DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE); - DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE); - - sem->count = value; - - return 0; -} - -static inline int sem_wait(sem_t *sem) { - DosRequestMutexSem(sem->wait_mutex, -1); - - DosWaitEventSem(sem->event, -1); - - DosRequestMutexSem(sem->count_mutex, -1); - - sem->count--; - if (sem->count == 0) { - ULONG post_count; - - DosResetEventSem(sem->event, &post_count); - } - - DosReleaseMutexSem(sem->count_mutex); - - DosReleaseMutexSem(sem->wait_mutex); - - return 0; -} - -static inline int sem_post(sem_t *sem) { - DosRequestMutexSem(sem->count_mutex, -1); - - if (sem->count < 32768) { - sem->count++; - DosPostEventSem(sem->event); - } - - DosReleaseMutexSem(sem->count_mutex); - - return 0; -} - -static inline int sem_destroy(sem_t *sem) { - DosCloseEventSem(sem->event); - DosCloseMutexSem(sem->wait_mutex); - DosCloseMutexSem(sem->count_mutex); - - return 0; -} - -#define thread_sleep(nms) DosSleep(nms) - #else #ifdef __APPLE__ -#define sem_t semaphore_t -#define sem_init(X, Y, Z) \ - semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z) -#define sem_wait(sem) (semaphore_wait(*sem)) -#define sem_post(sem) semaphore_signal(*sem) -#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) +#define vp8_sem_t semaphore_t +#define vp8_sem_init(sem, pshared, value) \ + semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value) +#define vp8_sem_wait(sem) semaphore_wait(*sem) +#define vp8_sem_post(sem) semaphore_signal(*sem) +#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) #else +#include <errno.h> #include <unistd.h> #include <sched.h> +#define vp8_sem_t sem_t +#define vp8_sem_init sem_init +static INLINE int vp8_sem_wait(vp8_sem_t *sem) { + int ret; + while ((ret = sem_wait(sem)) == -1 && errno == EINTR) { + } + 
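/* [annotation, not part of the upstream diff] sem_wait() may fail with
 * errno == EINTR when a signal interrupts the wait, without the semaphore
 * having been decremented; the retry loop above means callers such as the
 * decode/encode worker loops can treat any non-zero return as a real
 * shutdown condition rather than a spurious wakeup. */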
return ret; +} +#define vp8_sem_post sem_post +#define vp8_sem_destroy sem_destroy #endif /* __APPLE__ */ /* Not Windows. Assume pthreads */ @@ -194,7 +90,6 @@ static inline int sem_destroy(sem_t *sem) { #define x86_pause_hint() #endif -#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_atomics.h" static INLINE void vp8_atomic_spin_wait( diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c index 2248345ba2..88f2de024b 100644 --- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c @@ -428,6 +428,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (setjmp(fb->pbi[0]->common.error.jmp)) { + fb->pbi[0]->common.error.setjmp = 0; vp8_remove_decoder_instances(fb); vp8_zero(fb->pbi); vpx_clear_system_state(); @@ -452,6 +453,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) { /* decoder instance for single thread mode */ remove_decompressor(pbi); + fb->pbi[0] = NULL; return VPX_CODEC_OK; } diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h index 1070849620..08a60b31b9 100644 --- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h @@ -14,6 +14,7 @@ #include <assert.h> #include "vpx_config.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/onyxd.h" #include "treereader.h" #include "vp8/common/onyxc_int.h" @@ -94,8 +95,8 @@ typedef struct VP8D_COMP { DECODETHREAD_DATA *de_thread_data; pthread_t *h_decoding_thread; - sem_t *h_event_start_decoding; - sem_t h_event_end_decoding; + vp8_sem_t *h_event_start_decoding; + vp8_sem_t h_event_end_decoding; /* end of threading data */ #endif diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c index 6ccb080cf9..d16284d134 100644 --- a/media/libvpx/libvpx/vp8/decoder/threading.c +++ b/media/libvpx/libvpx/vp8/decoder/threading.c @@ -15,6 +15,7 @@ #endif #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/common.h" #include "vp8/common/threading.h" #include "vp8/common/loopfilter.h" @@ -577,10 +578,10 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, /* signal end of decoding of current thread for current frame */ if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows) - sem_post(&pbi->h_event_end_decoding); + vp8_sem_post(&pbi->h_event_end_decoding); } -static THREAD_FUNCTION thread_decoding_proc(void *p_data) { +static THREADFN thread_decoding_proc(void *p_data) { int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1); MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2); @@ -589,7 +590,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { while (1) { if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; - if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { + if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { @@ -598,16 +599,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { if (setjmp(xd->error_info.jmp)) { xd->error_info.setjmp = 0; // Signal the end of decoding for current thread. 
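/* [annotation, not part of the upstream diff] xd->error_info.setjmp arms
 * the longjmp target created by the setjmp() call above: vpx_internal_error()
 * only jumps while it is 1. The upstream change also clears the flag again
 * right after mt_decode_mb_rows() returns (just below), so an error raised
 * later cannot longjmp into a frame that is no longer executing. */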
- sem_post(&pbi->h_event_end_decoding); + vp8_sem_post(&pbi->h_event_end_decoding); continue; } xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, ithread + 1); + xd->error_info.setjmp = 0; } } } - return 0; + return THREAD_EXIT_SUCCESS; } void vp8_decoder_create_threads(VP8D_COMP *pbi) { @@ -634,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32); CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count); - if (sem_init(&pbi->h_event_end_decoding, 0, 0)) { + if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) { vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to initialize semaphore"); } for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) { - if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; + if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd); @@ -650,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { if (pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, &pbi->de_thread_data[ithread])) { - sem_destroy(&pbi->h_event_start_decoding[ithread]); + vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]); break; } } @@ -661,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { /* the remainder of cleanup cases will be handled in * vp8_decoder_remove_threads(). */ if (pbi->allocated_decoding_thread_count == 0) { - sem_destroy(&pbi->h_event_end_decoding); + vp8_sem_destroy(&pbi->h_event_end_decoding); } vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to create threads"); @@ -812,16 +814,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { - sem_post(&pbi->h_event_start_decoding[i]); + vp8_sem_post(&pbi->h_event_start_decoding[i]); pthread_join(pbi->h_decoding_thread[i], NULL); } for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { - sem_destroy(&pbi->h_event_start_decoding[i]); + vp8_sem_destroy(&pbi->h_event_start_decoding[i]); } if (pbi->allocated_decoding_thread_count) { - sem_destroy(&pbi->h_event_end_decoding); + vp8_sem_destroy(&pbi->h_event_end_decoding); } vpx_free(pbi->h_decoding_thread); @@ -883,7 +885,7 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { pbi->decoding_thread_count); for (i = 0; i < pbi->decoding_thread_count; ++i) { - sem_post(&pbi->h_event_start_decoding[i]); + vp8_sem_post(&pbi->h_event_start_decoding[i]); } if (setjmp(xd->error_info.jmp)) { @@ -893,15 +895,16 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { // the current frame while the main thread starts decoding the next frame, // which causes a data race. for (i = 0; i < pbi->decoding_thread_count; ++i) - sem_wait(&pbi->h_event_end_decoding); + vp8_sem_wait(&pbi->h_event_end_decoding); return -1; } xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, 0); + xd->error_info.setjmp = 0; for (i = 0; i < pbi->decoding_thread_count + 1; ++i) - sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ return 0; } diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c index 82c48b13a7..d0117897db 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c +++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c @@ -7,38 +7,38 @@ * in the file PATENTS. 
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <stdio.h> #include <limits.h> +#include <stdio.h> #include "vpx_config.h" -#include "vp8_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "bitstream.h" -#include "encodemb.h" -#include "encodemv.h" -#if CONFIG_MULTITHREAD -#include "ethreading.h" -#endif + #include "vp8/common/common.h" -#include "onyx_int.h" -#include "vp8/common/extend.h" #include "vp8/common/entropymode.h" -#include "vp8/common/quant_common.h" -#include "segmentation.h" -#include "vp8/common/setupintrarecon.h" -#include "encodeintra.h" -#include "vp8/common/reconinter.h" -#include "rdopt.h" -#include "pickinter.h" +#include "vp8/common/extend.h" #include "vp8/common/findnearmv.h" #include "vp8/common/invtrans.h" +#include "vp8/common/quant_common.h" +#include "vp8/common/reconinter.h" +#include "vp8/common/setupintrarecon.h" +#include "vp8/common/threading.h" +#include "vp8/encoder/bitstream.h" +#include "vp8/encoder/encodeframe.h" +#include "vp8/encoder/encodeintra.h" +#include "vp8/encoder/encodemb.h" +#include "vp8/encoder/encodemv.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/pickinter.h" +#include "vp8/encoder/rdopt.h" +#include "vp8/encoder/segmentation.h" +#include "vp8_rtcd.h" #include "vpx/internal/vpx_codec_internal.h" +#include "vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" -#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING -#include "bitstream.h" + +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" #endif -#include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); @@ -773,7 +773,7 @@ void vp8_encode_frame(VP8_COMP *cpi) { vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_post(&cpi->h_event_start_encoding[i]); + vp8_sem_post(&cpi->h_event_start_encoding[i]); } for (mb_row = 0; mb_row < cm->mb_rows; @@ -806,7 +806,7 @@ void vp8_encode_frame(VP8_COMP *cpi) { } /* Wait for all the threads to finish. 
*/ for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_wait(&cpi->h_event_end_encoding[i]); + vp8_sem_wait(&cpi->h_event_end_encoding[i]); } for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c index e2f8b89d46..98c87d3cbc 100644 --- a/media/libvpx/libvpx/vp8/encoder/ethreading.c +++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c @@ -10,6 +10,7 @@ #include <stddef.h> #include "onyx_int.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/threading.h" #include "vp8/common/common.h" #include "vp8/common/extend.h" @@ -22,27 +23,27 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip); -static THREAD_FUNCTION thread_loopfilter(void *p_data) { +static THREADFN thread_loopfilter(void *p_data) { VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); VP8_COMMON *cm = &cpi->common; while (1) { if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; - if (sem_wait(&cpi->h_event_start_lpf) == 0) { + if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); - sem_post(&cpi->h_event_end_lpf); + vp8_sem_post(&cpi->h_event_end_lpf); } } - return 0; + return THREAD_EXIT_SUCCESS; } -static THREAD_FUNCTION thread_encoding_proc(void *p_data) { +static THREADFN thread_encoding_proc(void *p_data) { int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread; VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1); MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2); @@ -51,7 +52,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { while (1) { if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; - if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { + if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; VP8_COMMON *cm = &cpi->common; int mb_row; @@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count; } /* Signal that this thread has completed processing its rows. 
*/ - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); } } /* printf("exit thread %d\n", ithread); */ - return 0; + return THREAD_EXIT_SUCCESS; } static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { @@ -514,9 +515,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, - vpx_malloc(sizeof(sem_t) * th_count)); + vpx_malloc(sizeof(vp8_sem_t) * th_count)); CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, - vpx_malloc(sizeof(sem_t) * th_count)); + vpx_malloc(sizeof(vp8_sem_t) * th_count)); CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); @@ -538,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb); vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd); - sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); - sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); + vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); + vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); ethd->ithread = ithread; ethd->ptr1 = (void *)cpi; @@ -554,11 +555,11 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { - sem_post(&cpi->h_event_start_encoding[ithread]); - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_start_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); - sem_destroy(&cpi->h_event_start_encoding[ithread]); - sem_destroy(&cpi->h_event_end_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]); } /* free thread related resources */ @@ -580,8 +581,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { { LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data; - sem_init(&cpi->h_event_start_lpf, 0, 0); - sem_init(&cpi->h_event_end_lpf, 0, 0); + vp8_sem_init(&cpi->h_event_start_lpf, 0, 0); + vp8_sem_init(&cpi->h_event_end_lpf, 0, 0); lpfthd->ptr1 = (void *)cpi; rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd); @@ -590,14 +591,14 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { - sem_post(&cpi->h_event_start_encoding[ithread]); - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_start_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); - sem_destroy(&cpi->h_event_start_encoding[ithread]); - sem_destroy(&cpi->h_event_end_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]); } - sem_destroy(&cpi->h_event_end_lpf); - sem_destroy(&cpi->h_event_start_lpf); + vp8_sem_destroy(&cpi->h_event_end_lpf); + vp8_sem_destroy(&cpi->h_event_start_lpf); /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); @@ -627,21 +628,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { int i; for (i = 0; i < cpi->encoding_thread_count; ++i) { - 
sem_post(&cpi->h_event_start_encoding[i]); - sem_post(&cpi->h_event_end_encoding[i]); + vp8_sem_post(&cpi->h_event_start_encoding[i]); + vp8_sem_post(&cpi->h_event_end_encoding[i]); pthread_join(cpi->h_encoding_thread[i], 0); - sem_destroy(&cpi->h_event_start_encoding[i]); - sem_destroy(&cpi->h_event_end_encoding[i]); + vp8_sem_destroy(&cpi->h_event_start_encoding[i]); + vp8_sem_destroy(&cpi->h_event_end_encoding[i]); } - sem_post(&cpi->h_event_start_lpf); + vp8_sem_post(&cpi->h_event_start_lpf); pthread_join(cpi->h_filter_thread, 0); } - sem_destroy(&cpi->h_event_end_lpf); - sem_destroy(&cpi->h_event_start_lpf); + vp8_sem_destroy(&cpi->h_event_end_lpf); + vp8_sem_destroy(&cpi->h_event_start_lpf); cpi->b_lpf_running = 0; /* free thread related resources */ diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c index 4e128e3c49..ad01c6fc86 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c +++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c @@ -63,7 +63,7 @@ extern int vp8_update_coef_context(VP8_COMP *cpi); #endif -extern unsigned int vp8_get_processor_freq(); +extern unsigned int vp8_get_processor_freq(void); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); @@ -267,7 +267,11 @@ static int rescale(int val, int num, int denom) { int64_t llden = denom; int64_t llval = val; - return (int)(llval * llnum / llden); + int64_t result = (llval * llnum / llden); + if (result <= INT_MAX) + return (int)result; + else + return INT_MAX; } void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, @@ -276,7 +280,10 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, LAYER_CONTEXT *lc = &cpi->layer_context[layer]; lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; - lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; + if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000) + lc->target_bandwidth = INT_MAX; + else + lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level; lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level; @@ -1381,7 +1388,10 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i]; - lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; + if (oxcf->target_bitrate[i] > INT_MAX / 1000) + lc->target_bandwidth = INT_MAX; + else + lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; lc->starting_buffer_level = rescale( (int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000); @@ -1995,6 +2005,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (vp8cx_create_encoder_threads(cpi)) { + cpi->common.error.setjmp = 0; vp8_remove_compressor(&cpi); return 0; } @@ -2048,8 +2059,6 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { vp8_loop_filter_init(cm); - cpi->common.error.setjmp = 0; - #if CONFIG_MULTI_RES_ENCODING /* Calculate # of MBs in a row in lower-resolution level image. 
*/ @@ -2076,6 +2085,8 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { vp8_setup_block_ptrs(&cpi->mb); vp8_setup_block_dptrs(&cpi->mb.e_mbd); + cpi->common.error.setjmp = 0; + return cpi; } @@ -3172,7 +3183,8 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { #if CONFIG_MULTITHREAD if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { - sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ + /* signal that we have set filter_level */ + vp8_sem_post(&cpi->h_event_end_lpf); } #endif @@ -4387,11 +4399,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #if CONFIG_MULTITHREAD if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ - sem_post(&cpi->h_event_start_lpf); + vp8_sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; /* wait for the filter_level to be picked so that we can continue with * stream packing */ - sem_wait(&cpi->h_event_end_lpf); + vp8_sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -5120,6 +5132,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); +#if CONFIG_MULTITHREAD + /* wait for the lpf thread done */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { + vp8_sem_wait(&cpi->h_event_end_lpf); + cpi->b_lpf_running = 0; + } +#endif + if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) { generate_psnr_packet(cpi); } @@ -5247,16 +5267,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #endif #endif - cpi->common.error.setjmp = 0; - -#if CONFIG_MULTITHREAD - /* wait for the lpf thread done */ - if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { - sem_wait(&cpi->h_event_end_lpf); - cpi->b_lpf_running = 0; - } -#endif - return 0; } diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h index 1451a27812..bb1518ed7f 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h +++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h @@ -20,6 +20,7 @@ #include "tokenize.h" #include "vp8/common/onyxc_int.h" #include "vpx_dsp/variance.h" +#include "vpx_util/vpx_pthread.h" #include "encodemb.h" #include "vp8/encoder/quantize.h" #include "vp8/common/entropy.h" @@ -540,10 +541,10 @@ typedef struct VP8_COMP { LPFTHREAD_DATA lpf_thread_data; /* events */ - sem_t *h_event_start_encoding; - sem_t *h_event_end_encoding; - sem_t h_event_start_lpf; - sem_t h_event_end_lpf; + vp8_sem_t *h_event_start_encoding; + vp8_sem_t *h_event_end_encoding; + vp8_sem_t h_event_start_lpf; + vp8_sem_t h_event_end_lpf; #endif TOKENLIST *tplist; diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c index fcd4eb04eb..7ba7a308ab 100644 --- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c +++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c @@ -791,8 +791,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / one_percent_bits); } else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) { - percent_high = - (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8)); + if (cpi->total_byte_count > 0) { + percent_high = (int)((100 * cpi->bits_off_target) / + (cpi->total_byte_count * 8)); + } else { + percent_high = cpi->oxcf.over_shoot_pct; + } } if (percent_high > cpi->oxcf.over_shoot_pct) { @@ -1190,10 +1194,13 @@ int vp8_regulate_q(VP8_COMP 
*cpi, int target_bits_per_frame) { /* Calculate required scaling factor based on target frame size and * size of frame produced using previous Q */ - if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) { - /* Case where we would overflow int */ - target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) - << BPER_MB_NORMBITS; + if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) { + int temp = target_bits_per_frame / cpi->common.MBs; + if (temp > (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = INT_MAX; + } else { + target_bits_per_mb = temp << BPER_MB_NORMBITS; + } } else { target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; @@ -1534,9 +1541,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // undershoots significantly, and then we end up dropping every other // frame because the QP/rate_correction_factor may have been too low // before the drop and then takes too long to come up. - if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) { - target_bits_per_mb = (target_size / cpi->common.MBs) - << BPER_MB_NORMBITS; + if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) { + int temp = target_size / cpi->common.MBs; + if (temp > (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = INT_MAX; + } else { + target_bits_per_mb = temp << BPER_MB_NORMBITS; + } } else { target_bits_per_mb = (target_size << BPER_MB_NORMBITS) / cpi->common.MBs; diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h index 47b5be17f1..5223aa2d86 100644 --- a/media/libvpx/libvpx/vp8/encoder/tokenize.h +++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h @@ -18,8 +18,6 @@ extern "C" { #endif -void vp8_tokenize_initialize(); - typedef struct { short Token; short Extra; diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c index 1f16cc53d3..2b238c1a97 100644 --- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c @@ -8,6 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + #include "./vpx_config.h" #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" @@ -18,6 +23,7 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_timestamp.h" #if CONFIG_MULTITHREAD #include "vp8/encoder/ethreading.h" @@ -27,8 +33,6 @@ #include "vp8/encoder/firstpass.h" #include "vp8/common/onyx.h" #include "vp8/common/common.h" -#include <stdlib.h> -#include <string.h> struct vp8_extracfg { struct vpx_codec_pkt_list *pkt_list; @@ -148,7 +152,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_profile, 3); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); - RANGE_CHECK_HI(cfg, g_threads, 64); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); #if CONFIG_REALTIME_ONLY RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); #elif CONFIG_MULTI_RES_ENCODING @@ -495,7 +499,10 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); #if CONFIG_MULTITHREAD - if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; + if (vp8cx_create_encoder_threads(ctx->cpi)) { + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_ERROR; + } #endif ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; @@ -777,9 +784,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, return res; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - vpx_enc_deadline_t deadline) { +static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { int new_qc; #if !(CONFIG_REALTIME_ONLY) @@ -788,13 +795,15 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, if (deadline) { /* Convert duration parameter from stream timebase to microseconds */ - uint64_t duration_us; - VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && (TICKS_PER_SEC % 1000000) == 0); - duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / - (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); + if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) { + ERROR("duration is too big"); + } + uint64_t duration_us = + duration * (uint64_t)ctx->timestamp_ratio.num / + ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); /* If the deadline is more that the duration this frame is to be shown, * use good quality mode. Otherwise use realtime mode. 
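The pts/duration handling added in this file guards every widening multiply before performing it. A minimal standalone sketch of the same guard, under illustrative names (pts_to_ticks is not a libvpx function):

#include <stdint.h>

/* Convert a relative pts to ticks as pts * num / den in 64-bit arithmetic,
 * rejecting inputs for which the multiplication would overflow. */
static int pts_to_ticks(int64_t pts, int64_t num, int64_t den,
                        int64_t *ticks) {
  if (num <= 0 || den <= 0 || pts < 0) return 0; /* invalid input */
  if (pts > INT64_MAX / num) return 0;           /* multiply would wrap */
  *ticks = pts * num / den;
  return 1; /* ok */
}

The checks in vp8e_encode() above follow exactly this shape, reporting VPX_CODEC_INVALID_PARAM through vpx_internal_error() instead of returning a flag.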
@@ -820,6 +829,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, ctx->oxcf.Mode = new_qc; vp8_change_config(ctx->cpi, &ctx->oxcf); } + return VPX_CODEC_OK; } static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, @@ -894,13 +904,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); - if (!ctx->pts_offset_initialized) { - ctx->pts_offset = pts_val; - ctx->pts_offset_initialized = 1; - } - pts_val -= ctx->pts_offset; - - pick_quickcompress_mode(ctx, duration, deadline); + if (!res) res = pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); // If no flags are set in the encode call, then use the frame flags as @@ -924,7 +928,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; - YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp, dst_end_time_stamp; size_t size, cx_data_sz; unsigned char *cx_data; @@ -951,12 +954,44 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Convert API flags to internal codec lib flags */ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - dst_time_stamp = - pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - dst_end_time_stamp = (pts_val + (int64_t)duration) * - ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - if (img != NULL) { + YV12_BUFFER_CONFIG sd; + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts_val; + ctx->pts_offset_initialized = 1; + } + if (pts_val < ctx->pts_offset) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } + pts_val -= ctx->pts_offset; + if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } + dst_time_stamp = + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (pts_val > INT64_MAX - (int64_t)duration) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + vpx_codec_pts_t pts_end = pts_val + (int64_t)duration; + if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } + dst_end_time_stamp = + pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + res = image2yuvconfig(img, &sd); if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { @@ -989,6 +1024,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, &dst_end_time_stamp, !img); if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) { + ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_CORRUPT_FRAME; } else if (comp_data_state == -1) { break; diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c index e81deaf4ea..fa7d7be403 100644 --- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c @@ -488,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) { pc->fb_idx_ref_cnt[pc->new_fb_idx]--; } - pc->error.setjmp = 0; + 
pbi->common.error.setjmp = 0; #if CONFIG_MULTITHREAD if (pbi->restart_threads) { ctx->si.w = 0; diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc index 261c316fd1..312092f190 100644 --- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc @@ -8,10 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vp8/vp8_ratectrl_rtc.h" + #include <math.h> + #include <new> + #include "vp8/common/common.h" -#include "vp8/vp8_ratectrl_rtc.h" #include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -311,6 +314,14 @@ FrameDropDecision VP8RateControlRTC::ComputeQP( int VP8RateControlRTC::GetQP() const { return q_; } +UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const { + VP8_COMMON *cm = &cpi_->common; + UVDeltaQP uv_delta_q; + uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q; + uv_delta_q.uvac_delta_q = cm->uvac_delta_q; + return uv_delta_q; +} + int VP8RateControlRTC::GetLoopfilterLevel() const { VP8_COMMON *cm = &cpi_->common; const double qp = q_; diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h index 59fb607526..b458b5ce65 100644 --- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h @@ -21,7 +21,6 @@ struct VP8_COMP; namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { - public: VP8RateControlRtcConfig() { memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); @@ -42,6 +41,9 @@ class VP8RateControlRTC { bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; + // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest + // delta QP for UV. + UVDeltaQP GetUVDeltaQP() const; // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter // level is calculated from frame qp. 
int GetLoopfilterLevel() const; @@ -53,10 +55,10 @@ class VP8RateControlRTC { void PostEncodeUpdate(uint64_t encoded_frame_size); private: - VP8RateControlRTC() {} + VP8RateControlRTC() = default; bool InitRateControl(const VP8RateControlRtcConfig &cfg); - struct VP8_COMP *cpi_; - int q_; + struct VP8_COMP *cpi_ = nullptr; + int q_ = -1; }; } // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h index 1cfc12f6fa..4c8fcf6989 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h +++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#include "vpx_util/vpx_thread.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c index 37762ca15a..1a93b97e56 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c @@ -12,4 +12,4 @@ #include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp9_rtcd() { once(setup_rtcd_internal); } +void vp9_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl index 3ecbd5417f..af3ff0e980 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -129,7 +129,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; -specialize qw/vp9_block_error_fp neon avx2 sse2/; +specialize qw/vp9_block_error_fp neon sve avx2 sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; @@ -138,12 +138,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - specialize qw/vp9_block_error neon avx2 sse2/; + specialize qw/vp9_block_error neon sve avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error neon sse2/; } else { - specialize qw/vp9_block_error neon avx2 msa sse2/; + specialize qw/vp9_block_error neon sve avx2 msa sse2/; } # fdct functions diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c index 8df18af3b8..24adbcbff0 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_reconinter.h" diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h 
index 5df0117f12..96c705d0d5 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h @@ -12,6 +12,7 @@ #define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #ifdef __cplusplus diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c index c5892156f4..4fe680cefc 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c @@ -22,6 +22,7 @@ #include "vpx_ports/mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" @@ -2292,6 +2293,7 @@ static INLINE void init_mt(VP9Decoder *pbi) { ++pbi->num_tile_workers; winterface->init(worker); + worker->thread_name = "vpx tile worker"; if (n < num_threads - 1 && !winterface->reset(worker)) { do { winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c index 5a7e9f9ab3..5c77df5002 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c @@ -21,6 +21,7 @@ #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" @@ -210,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 0; vpx_get_worker_interface()->init(&pbi->lf_worker); + pbi->lf_worker.thread_name = "vpx lf worker"; return pbi; } diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h index 2e198d552e..b3ee4eab5f 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h @@ -16,6 +16,7 @@ #include "vpx/vpx_codec.h" #include "vpx_dsp/bitreader.h" #include "vpx_scale/yv12config.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_thread_common.h" diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c index 9a31f5a6d0..926ae87739 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c @@ -12,6 +12,7 @@ #include <string.h> #include "vpx/vpx_integer.h" +#include "vpx_util/vpx_pthread.h" #include "vp9/decoder/vp9_job_queue.h" diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h index bc23bf9c2c..59f71fb9ba 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h @@ -11,7 +11,7 @@ #ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ #define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ -#include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_pthread.h" typedef struct { // Pointer to buffer base which contains the jobs diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c new file mode 100644 index 0000000000..78e7361d85 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2024 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int64x2_t err_v = vdupq_n_s64(0); + int64x2_t ssz_v = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vabdq_s16(c0, d0); + const int16x8_t diff1 = vabdq_s16(c1, d1); + + err_v = vpx_dotq_s16(err_v, diff0, diff0); + err_v = vpx_dotq_s16(err_v, diff1, diff1); + + ssz_v = vpx_dotq_s16(ssz_v, c0, c0); + ssz_v = vpx_dotq_s16(ssz_v, c1, c1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_v); + return horizontal_add_int64x2(err_v); +} + +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int64x2_t err = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vabdq_s16(c0, d0); + const int16x8_t diff1 = vabdq_s16(c1, d1); + + err = vpx_dotq_s16(err, diff0, diff0); + err = vpx_dotq_s16(err, diff1, diff1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return horizontal_add_int64x2(err); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h index 7fa00cd194..6542794667 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h @@ -11,8 +11,6 @@ #ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_ #define VPX_VP9_ENCODER_VP9_BLOCK_H_ -#include "vpx_util/vpx_thread.h" - #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c index 42073f756c..ee0fcd8729 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c @@ -119,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; tree->block_size = square[0]; alloc_tree_contexts(cm, tree, 4); - tree->leaf_split[0] = this_leaf++; - for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; + tree->u.leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0]; } // Each node has 4 leaf nodes, fill each block_size level of the tree @@ -130,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { PC_TREE *const tree = 
&td->pc_tree[pc_tree_index]; alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); tree->block_size = square[square_index]; - for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++; ++pc_tree_index; } ++square_index; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h index 4e301cc17d..51e13ba654 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h @@ -90,7 +90,7 @@ typedef struct PC_TREE { union { struct PC_TREE *split[4]; PICK_MODE_CONTEXT *leaf_split[4]; - }; + } u; // Obtained from a simple motion search. Used by the ML based partition search // speed feature. MV mv; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c index 46291f4868..b24c85f406 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c @@ -21,7 +21,7 @@ #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vpx_ports/system_state.h" - +#include "vpx_util/vpx_pthread.h" #if CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" #endif // CONFIG_MISMATCH_DEBUG @@ -2303,16 +2303,16 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->leaf_split[0]); + pc_tree->u.leaf_split[0]); } else { encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); + pc_tree->u.split[0]); encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); + subsize, pc_tree->u.split[1]); encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); + subsize, pc_tree->u.split[2]); encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize, pc_tree->split[3]); + subsize, pc_tree->u.split[3]); } break; } @@ -2645,13 +2645,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); + pc_tree->u.split[0]); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); + subsize, pc_tree->u.split[1]); encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); + subsize, pc_tree->u.split[2]); encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, - output_enabled, subsize, pc_tree->split[3]); + output_enabled, subsize, pc_tree->u.split[3]); break; } @@ -2801,7 +2801,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX); + subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX); break; } last_part_rdc.rate = 0; @@ -2819,7 +2819,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, &tmp_rdc.dist, i != 3, - pc_tree->split[i]); + pc_tree->u.split[i]); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { 
vp9_rd_cost_reset(&last_part_rdc); break; @@ -2860,9 +2860,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, continue; save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); - pc_tree->split[i]->partitioning = PARTITION_NONE; + pc_tree->u.split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, - &tmp_rdc, split_subsize, &pc_tree->split[i]->none, + &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none, INT_MAX, INT64_MAX); restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2877,7 +2877,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (i != 3) encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0, - split_subsize, pc_tree->split[i]); + split_subsize, pc_tree->u.split[i]); pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, split_subsize); @@ -3391,7 +3391,7 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, features[feature_index++] = VPXMIN(rd_ratio, 2.0f); for (i = 0; i < 4; ++i) { - const int64_t this_rd = pc_tree->split[i]->none.rdcost; + const int64_t this_rd = pc_tree->u.split[i]->none.rdcost; const int rd_valid = this_rd > 0 && this_rd < 1000000000; // Ratio between sub-block RD and whole block RD. features[feature_index++] = @@ -3958,19 +3958,19 @@ static void store_superblock_info( } // recursively traverse partition tree when partition is split. assert(pc_tree->partitioning == PARTITION_SPLIT); - store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[0], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4, col_start_4x4, partition_info, motion_vector_info); - store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[1], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4, col_start_4x4 + subblock_square_size_4x4, partition_info, motion_vector_info); - store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[2], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4 + subblock_square_size_4x4, col_start_4x4, partition_info, motion_vector_info); - store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[3], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4 + subblock_square_size_4x4, col_start_4x4 + subblock_square_size_4x4, @@ -4114,7 +4114,7 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, vp9_zero(pc_tree->mv); } if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. - for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv; + for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv; } } @@ -4199,25 +4199,25 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_SPLIT // TODO(jingning): use the motion vectors given by the above search as // the starting point of motion search in the following partition type check. 
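Editorial note: the mechanical pc_tree->split / pc_tree->leaf_split to pc_tree->u.split / pc_tree->u.leaf_split renaming running through this file follows from the vp9_context_tree.h hunk above, which gives the previously anonymous union inside PC_TREE an explicit name. Anonymous unions are standard C only as of C11, so naming the member keeps the struct valid under older or stricter language modes; that appears to be the motivation here. A minimal self-contained sketch of the pattern, with a reduced, hypothetical field set:

#include <stdio.h>

/* Reduced PC_TREE-like node: the union now has an explicit name, u,
 * so every access must spell out the extra level. */
struct leaf { int rdcost; };
struct node {
  int block_size;
  union {
    struct node *split[4];      /* children when the block splits again */
    struct leaf *leaf_split[4]; /* leaves at the smallest block size */
  } u;
};

int main(void) {
  struct leaf l = { 42 };
  struct node n;
  n.u.leaf_split[0] = &l;                    /* was: n.leaf_split[0] */
  printf("%d\n", n.u.leaf_split[0]->rdcost); /* prints 42 */
  return 0;
}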
- pc_tree->split[0]->none.rdcost = 0; - pc_tree->split[1]->none.rdcost = 0; - pc_tree->split[2]->none.rdcost = 0; - pc_tree->split[3]->none.rdcost = 0; + pc_tree->u.split[0]->none.rdcost = 0; + pc_tree->u.split[1]->none.rdcost = 0; + pc_tree->u.split[2]->none.rdcost = 0; + pc_tree->u.split[3]->none.rdcost = 0; if (do_split || must_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); load_pred_mv(x, ctx); if (bsize == BLOCK_8X8) { i = 4; if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) - pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; + pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist); + pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist); if (sum_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { if (cpi->sf.prune_ref_frame_for_rect_partitions) { - const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; - const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1]; for (i = 0; i < 4; ++i) { ref_frames_used[i] |= (1 << ref1); if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); @@ -4250,21 +4250,21 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - pc_tree->split[i]->index = i; + pc_tree->u.split[i]->index = i; if (cpi->sf.prune_ref_frame_for_rect_partitions) - pc_tree->split[i]->none.rate = INT_MAX; + pc_tree->u.split[i]->none.rate = INT_MAX; found_best_rd = rd_pick_partition( cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &this_rdc, best_rdc_split, pc_tree->split[i]); + &this_rdc, best_rdc_split, pc_tree->u.split[i]); if (found_best_rd == 0) { sum_rdc.rdcost = INT64_MAX; break; } else { if (cpi->sf.prune_ref_frame_for_rect_partitions && - pc_tree->split[i]->none.rate != INT_MAX) { - const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; - const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + pc_tree->u.split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1]; ref_frames_used[i] |= (1 << ref1); if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); } @@ -4821,13 +4821,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row, } break; case PARTITION_SPLIT: { - fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]); + fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]); fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize, - pc_tree->split[1]); + pc_tree->u.split[1]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize, - pc_tree->split[2]); + pc_tree->u.split[2]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3]); + pc_tree->u.split[3]); break; } default: break; @@ -4845,7 +4845,8 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { if (bsize > BLOCK_8X8) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); int i; - for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize); + for (i = 0; i < 4; ++i) + pred_pixel_ready_reset(pc_tree->u.split[i], subsize); } } @@ -5046,9 +5047,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; 
load_pred_mv(x, ctx); - nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, 0, - best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); + nonrd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]); if (this_rdc.rate == INT_MAX) { vp9_rd_cost_reset(&sum_rdc); @@ -5281,10 +5282,10 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, - pc_tree->split[0]); + pc_tree->u.split[0]); nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, &this_rdc, - pc_tree->split[1]); + pc_tree->u.split[1]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -5292,7 +5293,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - &this_rdc, pc_tree->split[2]); + &this_rdc, pc_tree->u.split[2]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -5300,7 +5301,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, - output_enabled, &this_rdc, pc_tree->split[3]); + output_enabled, &this_rdc, pc_tree->u.split[3]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -5400,21 +5401,21 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, - subsize, pc_tree->leaf_split[0]); + subsize, pc_tree->u.leaf_split[0]); encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, - subsize, pc_tree->leaf_split[0]); + subsize, pc_tree->u.leaf_split[0]); } else { nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, - output_enabled, dummy_cost, pc_tree->split[0]); + output_enabled, dummy_cost, pc_tree->u.split[0]); nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, dummy_cost, - pc_tree->split[1]); + pc_tree->u.split[1]); nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - dummy_cost, pc_tree->split[2]); + dummy_cost, pc_tree->u.split[2]); nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, output_enabled, - dummy_cost, pc_tree->split[3]); + dummy_cost, pc_tree->u.split[3]); } break; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c index fd213f1e6b..3b8b5345f1 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c @@ -31,12 +31,14 @@ #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_pthread.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" #endif // 
CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" #if CONFIG_VP9_POSTPROC @@ -2135,24 +2137,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->external_resize = 1; } - if (cpi->initial_width) { - int new_mi_size = 0; - vp9_set_mb_mi(cm, cm->width, cm->height); - new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); - if (cm->mi_alloc_size < new_mi_size) { - vp9_free_context_buffers(cm); - vp9_free_pc_tree(&cpi->td); - vpx_free(cpi->mbmi_ext_base); - alloc_compressor_data(cpi); - realloc_segmentation_maps(cpi); - cpi->initial_width = cpi->initial_height = 0; - cpi->external_resize = 0; - } else if (cm->mi_alloc_size == new_mi_size && - (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { - if (vp9_alloc_loop_filter(cm)) { - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate loop filter data"); - } + int new_mi_size = 0; + vp9_set_mb_mi(cm, cm->width, cm->height); + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + cpi->external_resize = 0; + } else if (cm->mi_alloc_size == new_mi_size && + (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); } } @@ -3472,7 +3472,6 @@ void vp9_scale_references(VP9_COMP *cpi) { continue; } -#if CONFIG_VP9_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { RefCntBuffer *new_fb_ptr = NULL; int force_scaling = 0; @@ -3485,6 +3484,7 @@ void vp9_scale_references(VP9_COMP *cpi) { new_fb_ptr = &pool->frame_bufs[new_fb]; if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { +#if CONFIG_VP9_HIGHBITDEPTH if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, cm->use_highbitdepth, @@ -3494,22 +3494,7 @@ void vp9_scale_references(VP9_COMP *cpi) { "Failed to allocate frame buffer"); scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, EIGHTTAP, 0); - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; - alloc_frame_mvs(cm, new_fb); - } #else - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { - RefCntBuffer *new_fb_ptr = NULL; - int force_scaling = 0; - int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; - if (new_fb == INVALID_IDX) { - new_fb = get_free_fb(cm); - force_scaling = 1; - } - if (new_fb == INVALID_IDX) return; - new_fb_ptr = &pool->frame_bufs[new_fb]; - if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height) { if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, @@ -3517,10 +3502,10 @@ void vp9_scale_references(VP9_COMP *cpi) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); +#endif // CONFIG_VP9_HIGHBITDEPTH cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } -#endif // CONFIG_VP9_HIGHBITDEPTH } else 
{ int buf_idx; RefCntBuffer *buf = NULL; @@ -3958,6 +3943,35 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) { #endif } +static YV12_BUFFER_CONFIG *svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->bit_depth == VPX_BITS_8) { + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); + } else { + scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + } +#else + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); +#endif // CONFIG_VP9_HIGHBITDEPTH + return scaled; + } else { + return unscaled; + } +} + static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; @@ -4000,7 +4014,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // result will be saved in scaled_temp and might be used later. const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; const int phase_scaler2 = svc->downsample_filter_phase[1]; - cpi->Source = vp9_svc_twostage_scale( + cpi->Source = svc_twostage_scale( cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); svc->scaled_one_half = 1; @@ -4486,21 +4500,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // external rate control model. // This flag doesn't have any impact when external rate control is not used. int ext_rc_recode = 0; - // Maximal frame size allowed by the external rate control. - // case: 0, we ignore the max frame size limit, and encode with the qindex - // passed in by the external rate control model. - // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex - // and may recode if undershoot/overshoot is seen. - // If the external qindex is not VPX_DEFAULT_Q, we force no recode. - // case: -1, we take libvpx's decision for the max frame size, as well as - // the recode decision. - // Otherwise: if a specific size is given, libvpx's recode decision - // will respect the given size. - int ext_rc_max_frame_size = 0; - // Use VP9's decision of qindex. This flag is in use only in external rate - // control model to help determine whether to recode when - // |ext_rc_max_frame_size| is 0. 
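Editorial note: the declarations being removed here and just below implemented a fallback in which libvpx could override an external rate controller's qindex and recode on overshoot, steered by ext_rc_max_frame_size and ext_rc_use_default_q. After this change the external QP decision is final: the frame is encoded once at the supplied q and the recode loop exits. A minimal control-flow sketch under that reading, with hypothetical names:

/* Sketch of the simplified recode contract: an active external QP
 * controller makes the encode loop run exactly once. */
static int encode_with_recode(int ext_qp_rc_active, int suggested_q) {
  int q = suggested_q;
  do {
    /* ... set the quantizer to q and encode the frame ... */
    if (ext_qp_rc_active) break; /* external q is authoritative, no recode */
    /* ... otherwise libvpx's own over/undershoot checks may adjust q
       and iterate ... */
  } while (0 /* internal recode condition would go here */);
  return q;
}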
- int ext_rc_use_default_q = 1; const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL @@ -4616,27 +4615,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && + !cpi->tpl_with_external_rc && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; - FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; - const int ref_frame_flags = get_ref_frame_flags(cpi); - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; - const RefCntBuffer *curr_frame_buf = - get_ref_cnt_buffer(cm, cm->new_fb_idx); - // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. - // index 1 refers to the first encoding frame in a gf group. - // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. - // See function define_gf_group_structure(). - const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; - get_ref_frame_bufs(cpi, ref_frame_bufs); codec_status = vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, curr_frame_buf->frame_index, - cm->current_frame_coding_index, gf_group->index, update_type, - gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, - &encode_frame_decision); + &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); @@ -4645,9 +4631,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // libvpx's default q. if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { q = encode_frame_decision.q_index; - ext_rc_use_default_q = 0; } - ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } vp9_set_quantizer(cpi, q); @@ -4690,21 +4674,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { - // In general, for the external rate control, we take the qindex provided - // as input and encode the frame with this qindex faithfully. However, - // in some extreme scenarios, the provided qindex leads to a massive - // overshoot of frame size. In this case, we fall back to VP9's decision - // to pick a new qindex and recode the frame. We return the new qindex - // through the API to the external model. - if (ext_rc_max_frame_size == 0) { - if (!ext_rc_use_default_q) break; - } else if (ext_rc_max_frame_size == -1) { - // Do nothing, fall back to libvpx's recode decision. - } else { - // Change the max frame size, used in libvpx's recode decision. 
- rc->max_frame_bandwidth = ext_rc_max_frame_size; - } - ext_rc_recode = 1; + break; } #if CONFIG_RATE_CTRL if (cpi->oxcf.use_simple_encode_api) { @@ -4974,35 +4944,6 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( - VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, - int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { - if (cm->mi_cols * MI_SIZE != unscaled->y_width || - cm->mi_rows * MI_SIZE != unscaled->y_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->bit_depth == VPX_BITS_8) { - vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, - phase_scaler2); - vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, - phase_scaler); - } else { - scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, - filter_type2, phase_scaler2); - scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, - filter_type, phase_scaler); - } -#else - vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, - phase_scaler2); - vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); -#endif // CONFIG_VP9_HIGHBITDEPTH - return scaled; - } else { - return unscaled; - } -} - YV12_BUFFER_CONFIG *vp9_scale_if_required( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { @@ -6429,7 +6370,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } if (arf_src_index) { - assert(arf_src_index <= rc->frames_to_key); + if (!(cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) { + // This assert only makes sense when not using external RC. + assert(arf_src_index <= rc->frames_to_key); + } if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cpi->alt_ref_source = source; @@ -6617,7 +6563,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { vp9_init_tpl_buffer(cpi); - vp9_estimate_qp_gop(cpi); + vp9_estimate_tpl_qp_gop(cpi); vp9_setup_tpl_stats(cpi); } #if CONFIG_COLLECT_COMPONENT_TIMING diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h index 91df538821..898855d10d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h @@ -25,6 +25,7 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/psnr.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_timestamp.h" @@ -1062,7 +1063,7 @@ typedef struct VP9_COMP { */ uint64_t frame_component_time[kTimingComponents]; #endif - // Flag to indicate if QP and GOP for TPL is controlled by external RC. + // Flag to indicate if QP and GOP for TPL are controlled by external RC. 
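Editorial note: svc_twostage_scale, now defined file-locally in vp9_encoder.c (its public vp9_svc_twostage_scale declaration is dropped from this header just below), chains two scaling passes: the source is first scaled into scaled_temp with one filter/phase pair, then scaled_temp is scaled into the final buffer with the other, so SVC can reuse the intermediate half-resolution result. A standalone sketch of the same two-pass pattern, with a hypothetical 2x2 box filter standing in for vp9_scale_and_extend_frame:

/* Hypothetical half-resolution pass: 2x2 box filter with rounding. */
void half_scale(const unsigned char *src, int w, int h, unsigned char *dst) {
  int x, y;
  for (y = 0; y < h / 2; ++y)
    for (x = 0; x < w / 2; ++x)
      dst[y * (w / 2) + x] = (unsigned char)(
          (src[2 * y * w + 2 * x] + src[2 * y * w + 2 * x + 1] +
           src[(2 * y + 1) * w + 2 * x] +
           src[(2 * y + 1) * w + 2 * x + 1] + 2) >> 2);
}

/* Two-stage pattern: the first pass lands in tmp (kept for reuse), the
 * second produces the final quarter-resolution plane. */
void twostage_scale(const unsigned char *src, int w, int h,
                    unsigned char *tmp, unsigned char *dst) {
  half_scale(src, w, h, tmp);
  half_scale(tmp, w / 2, h / 2, dst);
}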
int tpl_with_external_rc; } VP9_COMP; @@ -1395,11 +1396,6 @@ void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); #endif // CONFIG_VP9_HIGHBITDEPTH -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( - VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, - int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); - YV12_BUFFER_CONFIG *vp9_scale_if_required( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c index a8d1cb7a7a..c3b79507e6 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c @@ -17,6 +17,7 @@ #include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_temporal_filter.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/vpx_pthread.h" static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { int i, j, k, l, m, n; @@ -55,7 +56,7 @@ static int enc_worker_hook(void *arg1, void *unused) { vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col); } - return 0; + return 1; } static int get_max_tile_cols(VP9_COMP *cpi) { @@ -106,6 +107,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { ++cpi->num_workers; winterface->init(worker); + worker->thread_name = "vpx enc worker"; if (i < num_workers - 1) { thread_data->cpi = cpi; @@ -204,8 +206,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { create_enc_workers(cpi, num_workers); for (i = 0; i < num_workers; i++) { - EncWorkerData *thread_data; - thread_data = &cpi->tile_thr_data[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; // Before encoding a frame, copy the thread data from cpi. 
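Editorial note: two edits recur throughout vp9_ethread.c in this patch: each VPxWorker is given a thread_name right after winterface->init() (the new vpx_pthread.h plumbing uses it to label the OS thread), and the worker hooks now return 1 instead of 0. The vpx_thread interface treats a hook's return value as a success flag and folds it into the worker's had_error state, so a hook that always returned 0 would mark every threaded pass as failed once that flag is consulted. A reduced sketch of the convention, using a hypothetical mini-interface:

/* Reduced worker model: a nonzero hook return means success. */
typedef int (*worker_hook)(void *data1, void *data2);

struct worker {
  const char *thread_name; /* label applied to the spawned OS thread */
  worker_hook hook;
  int had_error;
};

static int encode_tile_hook(void *data1, void *data2) {
  (void)data1; (void)data2;
  /* ... encode the tiles assigned to this worker ... */
  return 1; /* success; returning 0 would set had_error below */
}

static void run(struct worker *w) {
  w->had_error |= !w->hook(0, 0); /* mirrors vpx_thread's success check */
}

static void example(void) { /* usage sketch */
  struct worker w = { "vpx enc worker", encode_tile_hook, 0 };
  run(&w);
}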
if (thread_data->td != &cpi->td) { @@ -456,7 +457,7 @@ static int first_pass_worker_hook(void *arg1, void *arg2) { this_tile, &best_ref_mv, mb_row); } } - return 0; + return 1; } void vp9_encode_fp_row_mt(VP9_COMP *cpi) { @@ -543,7 +544,7 @@ static int temporal_filter_worker_hook(void *arg1, void *arg2) { mb_col_start, mb_col_end); } } - return 0; + return 1; } void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { @@ -616,7 +617,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *arg2) { vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); } } - return 0; + return 1; } void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h index 4c192da515..359cdd1290 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h @@ -11,13 +11,14 @@ #ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ #define VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#include "vpx_util/vpx_pthread.h" + #ifdef __cplusplus extern "C" { #endif #define MAX_NUM_TILE_COLS (1 << 6) #define MAX_NUM_TILE_ROWS 4 -#define MAX_NUM_THREADS 80 struct VP9_COMP; struct ThreadData; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c index 4664e8c5e2..7b0d89acd2 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c @@ -156,32 +156,15 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } vpx_codec_err_t vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + EXT_RATECTRL *ext_ratectrl, int gop_index, vpx_rc_encodeframe_decision_t *encode_frame_decision) { - if (ext_ratectrl == NULL) { - return VPX_CODEC_INVALID_PARAM; - } - if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) { - vpx_rc_status_t rc_status; - vpx_rc_encodeframe_info_t encode_frame_info; - encode_frame_info.show_index = show_index; - encode_frame_info.coding_index = coding_index; - encode_frame_info.gop_index = gop_index; - encode_frame_info.frame_type = extrc_get_frame_type(update_type); - encode_frame_info.gop_size = gop_size; - encode_frame_info.use_alt_ref = use_alt_ref; - - vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, - encode_frame_info.ref_frame_coding_indexes, - encode_frame_info.ref_frame_valid_list); + assert(ext_ratectrl != NULL); + assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0); - rc_status = ext_ratectrl->funcs.get_encodeframe_decision( - ext_ratectrl->model, &encode_frame_info, encode_frame_decision); - if (rc_status == VPX_RC_ERROR) { - return VPX_CODEC_ERROR; - } + vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision( + ext_ratectrl->model, gop_index, encode_frame_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; } return VPX_CODEC_OK; } @@ -222,29 +205,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( } vpx_codec_err_t vp9_extrc_get_gop_decision( - EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, - vpx_rc_gop_decision_t *gop_decision) { + EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) { vpx_rc_status_t rc_status; if (ext_ratectrl == NULL || !ext_ratectrl->ready || (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { return 
VPX_CODEC_INVALID_PARAM; } - rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, - gop_info, gop_decision); - if (gop_decision->use_alt_ref) { - const int arf_constraint = - gop_decision->gop_coding_frames >= gop_info->min_gf_interval && - gop_decision->gop_coding_frames < gop_info->lag_in_frames; - if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; - } - // TODO(chengchen): Take min and max gf interval from the model - // and overwrite libvpx's decision so that we can get rid - // of one of the checks here. - if (gop_decision->gop_coding_frames > gop_info->frames_to_key || - gop_decision->gop_coding_frames - gop_decision->use_alt_ref > - gop_info->max_gf_interval) { - return VPX_CODEC_ERROR; - } + rc_status = + ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision); if (rc_status == VPX_RC_ERROR) { return VPX_CODEC_ERROR; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h index b04580c1d4..d1be5f2aef 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h @@ -39,9 +39,7 @@ vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, const VpxTplGopStats *tpl_gop_stats); vpx_codec_err_t vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + EXT_RATECTRL *ext_ratectrl, int gop_index, vpx_rc_encodeframe_decision_t *encode_frame_decision); vpx_codec_err_t vp9_extrc_update_encodeframe_result( @@ -50,9 +48,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth, const int actual_encoding_qindex); -vpx_codec_err_t vp9_extrc_get_gop_decision( - EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, - vpx_rc_gop_decision_t *gop_decision); +vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl, + vpx_rc_gop_decision_t *gop_decision); vpx_codec_err_t vp9_extrc_get_frame_rdmult( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c index dcb62e8768..69261ac65f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c @@ -162,42 +162,3 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, chroma_step); } - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int srcy, - int srcx, int srch, int srcw) { - // If the side is not touching the bounder then don't extend. - const int et_y = srcy ? 0 : dst->border; - const int el_y = srcx ? 0 : dst->border; - const int eb_y = srcy + srch != src->y_height - ? 0 - : dst->border + dst->y_height - src->y_height; - const int er_y = srcx + srcw != src->y_width - ? 
0 - : dst->border + dst->y_width - src->y_width; - const int src_y_offset = srcy * src->y_stride + srcx; - const int dst_y_offset = srcy * dst->y_stride + srcx; - - const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); - const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); - const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); - const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); - const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); - const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); - // detect nv12 colorspace - const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; - - copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, - dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch, - et_y, el_y, eb_y, er_y, 1); - - copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, - dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, - srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, - dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, - srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step); -} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h index 4ba7fc95e3..21d7e68b9f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h @@ -21,9 +21,6 @@ extern "C" { void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int srcy, - int srcx, int srch, int srcw); #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c index a9cdf5353f..58b9b7ba61 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c @@ -37,6 +37,7 @@ #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/variance.h" #define OUTPUT_FPF 0 @@ -1164,7 +1165,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, v_fn_ptr.vf = get_block_variance_fn(bsize); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8); + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); } #endif // CONFIG_VP9_HIGHBITDEPTH this_motion_error = @@ -2769,38 +2770,6 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { } } #endif - // If the external rate control model for GOP is used, the gop decisions - // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| - // will be overwritten. 
- if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && - cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { - vpx_codec_err_t codec_status; - vpx_rc_gop_decision_t gop_decision; - vpx_rc_gop_info_t gop_info; - gop_info.min_gf_interval = rc->min_gf_interval; - gop_info.max_gf_interval = rc->max_gf_interval; - gop_info.active_min_gf_interval = active_gf_interval.min; - gop_info.active_max_gf_interval = active_gf_interval.max; - gop_info.allow_alt_ref = allow_alt_ref; - gop_info.is_key_frame = is_key_frame; - gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; - gop_info.frames_since_key = rc->frames_since_key; - gop_info.frames_to_key = rc->frames_to_key; - gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; - gop_info.show_index = cm->current_video_frame; - gop_info.coding_index = cm->current_frame_coding_index; - gop_info.gop_global_index = rc->gop_global_index; - - codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, - &gop_decision); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_get_gop_decision() failed"); - } - gop_coding_frames = gop_decision.gop_coding_frames; - use_alt_ref = gop_decision.use_alt_ref; - } // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0; @@ -3600,32 +3569,71 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { else twopass->fr_content_type = FC_NORMAL; - // Keyframe and section processing. - if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { - // Define next KF group and assign bits to it. - find_next_key_frame(cpi, show_idx); + // If the external rate control model for GOP is used, the gop decisions + // are overwritten, including whether to use key frame in this GF group, + // GF group length, and whether to use arf. + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && + rc->frames_till_gf_update_due == 0) { + vpx_codec_err_t codec_status; + vpx_rc_gop_decision_t gop_decision; + codec_status = + vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + if (gop_decision.use_key_frame) { + cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. + rc->source_alt_ref_active = 0; + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + } + + // A new GF group + if (rc->frames_till_gf_update_due == 0) { + vp9_zero(twopass->gf_group); + ++rc->gop_global_index; + if (gop_decision.use_alt_ref) { + rc->source_alt_ref_pending = 1; + } + rc->baseline_gf_interval = + gop_decision.gop_coding_frames - rc->source_alt_ref_pending; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + define_gf_group_structure(cpi); + } } else { - cm->frame_type = INTER_FRAME; - } + // Keyframe and section processing. + if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { + // Define next KF group and assign bits to it. + find_next_key_frame(cpi, show_idx); + } else { + cm->frame_type = INTER_FRAME; + } - // Define a new GF/ARF group. (Should always enter here for key frames). 
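Editorial note: the GOP override that used to live inside define_gf_group() (removed above, together with its vpx_rc_gop_info_t bookkeeping) moves up into vp9_rc_get_second_pass_params(): when the external controller owns GOP structure, it now directly dictates key-frame placement, GOP length, and alt-ref use before a group is defined. A reduced sketch of the new flow, using a hypothetical state struct:

struct gop_decision { int use_key_frame, use_alt_ref, gop_coding_frames; };

/* Apply an external GOP decision at a group boundary. */
static void apply_gop_decision(const struct gop_decision *d, int *is_key,
                               int *alt_ref_pending,
                               int *baseline_gf_interval,
                               int *frames_till_gf_update_due) {
  if (d->use_key_frame) {
    *is_key = 1;
    *frames_till_gf_update_due = 0; /* a key frame always opens a GF group */
  }
  if (*frames_till_gf_update_due == 0) {
    *alt_ref_pending = d->use_alt_ref;
    /* an alt-ref frame is coded inside the group but not shown, hence -1 */
    *baseline_gf_interval = d->gop_coding_frames - d->use_alt_ref;
    *frames_till_gf_update_due = *baseline_gf_interval;
  }
}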
- if (rc->frames_till_gf_update_due == 0) { - define_gf_group(cpi, show_idx); + // Define a new GF/ARF group. (Should always enter here for key frames). + if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, show_idx); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; #if ARF_STATS_OUTPUT - { - FILE *fpfile; - fpfile = fopen("arf.stt", "a"); - ++arf_count; - fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", - cm->current_video_frame, rc->frames_till_gf_update_due, - rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); - - fclose(fpfile); - } + { + FILE *fpfile; + fpfile = fopen("arf.stt", "a"); + ++arf_count; + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->frames_till_gf_update_due, + rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); + + fclose(fpfile); + } #endif + } } vp9_configure_buffer_updates(cpi, gf_group->index); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c index 97838c38e6..b6be4f88ac 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c @@ -9,6 +9,7 @@ */ #include <assert.h> #include <stdlib.h> +#include <string.h> #include "./vpx_config.h" @@ -81,7 +82,6 @@ bail: return NULL; } -#define USE_PARTIAL_COPY 0 int vp9_lookahead_full(const struct lookahead_ctx *ctx) { return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz; } @@ -94,11 +94,6 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, vpx_enc_frame_flags_t flags) { struct lookahead_entry *buf; -#if USE_PARTIAL_COPY - int row, col, active_end; - int mb_rows = (src->y_height + 15) >> 4; - int mb_cols = (src->y_width + 15) >> 4; -#endif int width = src->y_crop_width; int height = src->y_crop_height; int uv_width = src->uv_crop_width; @@ -119,76 +114,36 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, height != buf->img.y_crop_height || uv_width != buf->img.uv_crop_width || uv_height != buf->img.uv_crop_height; - larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || - uv_width > buf->img.uv_width || - uv_height > buf->img.uv_height; + larger_dimensions = + width > buf->img.y_crop_width || height > buf->img.y_crop_height || + uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height; assert(!larger_dimensions || new_dimensions); -#if USE_PARTIAL_COPY - // TODO(jkoleszar): This is disabled for now, as - // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware. - - // Only do this partial copy if the following conditions are all met: - // 1. Lookahead queue has has size of 1. - // 2. Active map is provided. - // 3. This is not a key frame, golden nor altref frame. - if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) { - for (row = 0; row < mb_rows; ++row) { - col = 0; - - while (1) { - // Find the first active macroblock in this row. - for (; col < mb_cols; ++col) { - if (active_map[col]) break; - } - - // No more active macroblock in this row. - if (col == mb_cols) break; - - // Find the end of active region in this row. - active_end = col; - - for (; active_end < mb_cols; ++active_end) { - if (!active_map[active_end]) break; - } - - // Only copy this active region. 
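Editorial note: this USE_PARTIAL_COPY region of vp9_lookahead_push() had been compiled out (the macro was defined to 0) and is deleted together with its only callee, vp9_copy_and_extend_frame_with_rect(), removed from vp9_extend.c/.h above; every push now takes the full vp9_copy_and_extend_frame() path. In the same hunk the larger_dimensions test switches from the padded allocation sizes (y_width/y_height) to the visible crop sizes, which appears intended to make the reallocation decision track the picture actually being pushed. A reduced sketch of the predicate, with a hypothetical struct:

struct buf_dims {
  int y_width, y_height;           /* allocated, alignment-padded */
  int y_crop_width, y_crop_height; /* visible picture */
};

/* Reallocate only when the incoming visible frame outgrows the stored
 * visible frame, rather than the padded allocation. */
int needs_realloc(const struct buf_dims *b, int new_w, int new_h) {
  return new_w > b->y_crop_width || new_h > b->y_crop_height;
}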
- vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4, - 16, (active_end - col) << 4); - - // Start again from the end of this active region. - col = active_end; - } - - active_map += mb_cols; - } - } else { -#endif - if (larger_dimensions) { - YV12_BUFFER_CONFIG new_img; - memset(&new_img, 0, sizeof(new_img)); - if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, - subsampling_y, + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, + use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, 0)) - return 1; - vpx_free_frame_buffer(&buf->img); - buf->img = new_img; - } else if (new_dimensions) { - buf->img.y_crop_width = src->y_crop_width; - buf->img.y_crop_height = src->y_crop_height; - buf->img.uv_crop_width = src->uv_crop_width; - buf->img.uv_crop_height = src->uv_crop_height; - buf->img.subsampling_x = src->subsampling_x; - buf->img.subsampling_y = src->subsampling_y; - } - // Partial copy not implemented yet - vp9_copy_and_extend_frame(src, &buf->img); -#if USE_PARTIAL_COPY + VP9_ENC_BORDER_IN_PIXELS, 0)) + return 1; + vpx_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + buf->img.y_width = src->y_width; + buf->img.y_height = src->y_height; + buf->img.uv_width = src->uv_width; + buf->img.uv_height = src->uv_height; + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; } -#endif + vp9_copy_and_extend_frame(src, &buf->img); buf->ts_start = ts_start; buf->ts_end = ts_end; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c index 0843cd97e4..6e124f9944 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "vpx_util/vpx_pthread.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_multi_thread.h" diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c index 3f4fe6957b..d37e020b0a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c @@ -12,6 +12,7 @@ #include <math.h> #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/bitops.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c index 62d6b93028..76d5435e60 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c @@ -35,6 +35,7 @@ #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_svc_layercontext.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_ext_ratectrl.h" @@ -1433,8 +1434,8 @@ static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index, return q; } -static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, - int *top_index, int gf_group_index) { +int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP 
*cpi, int *bottom_index, + int *top_index, int gf_group_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -1581,7 +1582,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, q = active_worst_quality; } } - clamp(q, active_best_quality, active_worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; @@ -1603,8 +1603,8 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, else q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); } else { - q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, - gf_group_index); + q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, + gf_group_index); } if (cpi->sf.use_nonrd_pick_mode) { if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; @@ -1675,63 +1675,6 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { } } -void vp9_estimate_qp_gop(VP9_COMP *cpi) { - int gop_length = cpi->twopass.gf_group.gf_group_size; - int bottom_index, top_index; - int idx; - const int gf_index = cpi->twopass.gf_group.index; - const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; - const int refresh_frame_context = cpi->common.refresh_frame_context; - - for (idx = 1; idx <= gop_length; ++idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; - int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; - cpi->twopass.gf_group.index = idx; - vp9_rc_set_frame_target(cpi, target_rate); - vp9_configure_buffer_updates(cpi, idx); - if (cpi->tpl_with_external_rc) { - if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && - cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { - VP9_COMMON *cm = &cpi->common; - vpx_codec_err_t codec_status; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - vpx_rc_encodeframe_decision_t encode_frame_decision; - FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; - const RefCntBuffer *curr_frame_buf = - get_ref_cnt_buffer(cm, cm->new_fb_idx); - // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. - // index 1 refers to the first encoding frame in a gf group. - // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. - // See function define_gf_group_structure(). 
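Editorial note: one detail worth flagging in the hunk above is that the deleted clamp(q, active_best_quality, active_worst_quality); statement was a no-op, because clamp() in vpx_dsp_common.h returns the clamped value rather than modifying its argument, and the result was being discarded. A minimal illustration, assuming that clamp definition:

/* Same shape as libvpx's clamp(): returns, does not mutate. */
static int clamp_int(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

int pick_q(int q, int best, int worst) {
  clamp_int(q, best, worst);        /* no effect: return value dropped */
  return clamp_int(q, best, worst); /* the form that would take effect */
}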
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; - const int frame_coding_index = cm->current_frame_coding_index + idx - 1; - get_ref_frame_bufs(cpi, ref_frame_bufs); - codec_status = vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index, - gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref, - ref_frame_bufs, 0 /*ref_frame_flags is not used*/, - &encode_frame_decision); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_get_encodeframe_decision() failed"); - } - tpl_frame->base_qindex = encode_frame_decision.q_index; - } - } else { - tpl_frame->base_qindex = - rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx); - tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1); - } - } - // Reset the actual index and frame update - cpi->twopass.gf_group.index = gf_index; - cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; - cpi->common.refresh_frame_context = refresh_frame_context; - vp9_configure_buffer_updates(cpi, gf_index); -} - void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { @@ -3361,14 +3304,20 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor; } // For temporal layers, reset the rate control parametes across all - // temporal layers. If the first_spatial_layer_to_encode > 0, then this - // superframe has skipped lower base layers. So in this case we should also - // reset and force max-q for spatial layers < first_spatial_layer_to_encode. + // temporal layers. + // If the first_spatial_layer_to_encode > 0, then this superframe has + // skipped lower base layers. So in this case we should also reset and + // force max-q for spatial layers < first_spatial_layer_to_encode. + // For the case of no inter-layer prediction on delta frames: reset and + // force max-q for all spatial layers, to avoid excessive frame drops. 
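Editorial note: the widened loop below extends the overshoot reset. Previously only spatial layers below first_spatial_layer_to_encode were reset and forced to max-q; with inter-layer prediction disabled on delta frames, every spatial layer must be covered, since upper layers cannot recover through prediction from a corrected base layer. A reduced sketch of the selection logic, writing into a hypothetical per-layer flag array:

#define LAYER_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))

static void reset_layers_on_overshoot(int first_sl_to_encode,
                                      int inter_layer_pred_on, int num_sl,
                                      int num_tl, int *force_max_q) {
  int sl, tl;
  int n = first_sl_to_encode > 0 ? first_sl_to_encode : 1;
  if (!inter_layer_pred_on) n = num_sl; /* widen to all spatial layers */
  for (sl = 0; sl < n; ++sl)
    for (tl = 0; tl < num_tl; ++tl)
      force_max_q[LAYER_IDX(sl, tl, num_tl)] = 1;
}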
if (cpi->use_svc) { int tl = 0; int sl = 0; SVC *svc = &cpi->svc; - for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) { + int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode); + if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON) + num_spatial_layers = svc->number_spatial_layers; + for (sl = 0; sl < num_spatial_layers; ++sl) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h index 48c49e937e..0c61ad3461 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h @@ -346,12 +346,14 @@ int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); -void vp9_estimate_qp_gop(struct VP9_COMP *cpi); - void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); +int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi, + int *bottom_index, int *top_index, + int gf_group_index); + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c index 974e43c90f..447136ed84 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c @@ -1834,7 +1834,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } -static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { +static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) { if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { int_mv cur_fullpel_mv, prev_fullpel_mv; cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c index b8910370e0..048ab8732d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c @@ -18,9 +18,12 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_tpl_model.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, const GF_GROUP *gf_group, int *tpl_group_frames) { @@ -407,8 +410,12 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, tpl_block_stats_ptr->col = mi_col * 8; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; - tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; - tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + // inter/intra_cost here is calculated with SATD which should be close + // enough to be used as inter/intra_pred_error + tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost; + tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost; + tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; tpl_block_stats_ptr->mv_c = 
src_stats->mv.as_mv.col; tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; @@ -721,7 +728,9 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); tpl_stats->intra_cost = VPXMAX( 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + if (best_rf_idx >= 0) { + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + } tpl_stats->mv.as_int = best_mv.as_int; *ref_frame_idx = best_rf_idx; } @@ -1489,6 +1498,53 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { } #endif // CONFIG_RATE_CTRL +void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) { + int gop_length = cpi->twopass.gf_group.gf_group_size; + int bottom_index, top_index; + int idx; + const int gf_index = cpi->twopass.gf_group.index; + const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; + const int refresh_frame_context = cpi->common.refresh_frame_context; + + for (idx = 1; idx <= gop_length; ++idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; + int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; + cpi->twopass.gf_group.index = idx; + vp9_rc_set_frame_target(cpi, target_rate); + vp9_configure_buffer_updates(cpi, idx); + if (cpi->tpl_with_external_rc) { + VP9_COMMON *cm = &cpi->common; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + vpx_rc_encodeframe_decision_t encode_frame_decision; + codec_status = vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, gf_group->index - 1, &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } + tpl_frame->base_qindex = encode_frame_decision.q_index; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "The external rate control library is not set " + "properly for TPL pass."); + } + } else { + tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass( + cpi, &bottom_index, &top_index, idx); + tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1); + } + } + // Reset the actual index and frame update + cpi->twopass.gf_group.index = gf_index; + cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; + cpi->common.refresh_frame_context = refresh_frame_context; + vp9_configure_buffer_updates(cpi, gf_index); +} + void vp9_setup_tpl_stats(VP9_COMP *cpi) { GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -1512,12 +1568,16 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); } - // TPL stats has extra frames from next GOP. Trim those extra frames for - // Qmode. - trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); - if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { + // Intra search on key frame + if (gf_picture[0].update_type == KF_UPDATE) { + mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize); + } + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. 
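Editorial note: vp9_estimate_qp_gop() moves out of vp9_ratectrl.c and reappears above as vp9_estimate_tpl_qp_gop() in vp9_tpl_model.c. For each frame in the GF group it takes the base qindex either from the external controller, indexed by the 0-based gop position, or from the internal two-pass picker, floored at 1. A reduced sketch of that per-frame loop, with hypothetical callback types standing in for the two q sources:

static void estimate_tpl_base_q(int gop_length, int use_external_rc,
                                int (*external_q)(int gop_index),
                                int (*internal_q)(int gf_index),
                                int *base_qindex /* gop_length + 1 slots */) {
  int idx;
  for (idx = 1; idx <= gop_length; ++idx) {
    if (use_external_rc) {
      base_qindex[idx] = external_q(idx - 1); /* 0-based gop index */
    } else {
      int q = internal_q(idx);
      base_qindex[idx] = q < 1 ? 1 : q; /* VPXMAX(q, 1) in the real code */
    }
  }
}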
+ trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, + extended_frame_count); const vpx_codec_err_t codec_status = vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); if (codec_status != VPX_CODEC_OK) { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h index 04beb22610..de0ac39a1f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h @@ -31,6 +31,7 @@ typedef struct GF_PICTURE { void vp9_init_tpl_buffer(VP9_COMP *cpi); void vp9_setup_tpl_stats(VP9_COMP *cpi); void vp9_free_tpl_buffer(VP9_COMP *cpi); +void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi); void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, TX_SIZE tx_size); diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 94506aad0f..628dc4fead 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -886,14 +886,14 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, scale_plane_1_to_2_phase_0( src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, src_h, vp9_filter_kernels[filter_type][8], temp_buffer); - scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, src_w / 2, src_h / 2, - vp9_filter_kernels[filter_type][8], - temp_buffer); - scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, src_w / 2, src_h / 2, - vp9_filter_kernels[filter_type][8], - temp_buffer); + const int src_uv_w = src->uv_crop_width; + const int src_uv_h = src->uv_crop_height; + scale_plane_1_to_2_phase_0( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer); free(temp_buffer); } } diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc index fd81bce7b5..942c15ce49 100644 --- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc @@ -12,10 +12,12 @@ #include <new> #include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_picklpf.h" #include "vpx/vp8cx.h" #include "vpx/vpx_codec.h" +#include "vpx_mem/vpx_mem.h" namespace libvpx { diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h index 85005c5474..4c39255886 100644 --- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h @@ -12,43 +12,34 @@ #define VPX_VP9_RATECTRL_RTC_H_ #include <cstdint> +#include <cstring> +#include <limits> #include <memory> -#include "vp9/common/vp9_enums.h" -#include "vp9/vp9_iface_common.h" -#include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/vp9_cx_iface.h" +#include "vpx/vpx_encoder.h" #include "vpx/internal/vpx_ratectrl_rtc.h" -#include "vpx_mem/vpx_mem.h" struct VP9_COMP; namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { - public: VP9RateControlRtcConfig() { - ss_number_layers = 1; - vp9_zero(max_quantizers); - vp9_zero(min_quantizers); - vp9_zero(scaling_factor_den); - vp9_zero(scaling_factor_num); - vp9_zero(layer_target_bitrate); - 
vp9_zero(ts_rate_decimator); + memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator)); scaling_factor_num[0] = 1; scaling_factor_den[0] = 1; max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; - max_consec_drop = INT_MAX; } // Number of spatial layers - int ss_number_layers; - int max_quantizers[VPX_MAX_LAYERS]; - int min_quantizers[VPX_MAX_LAYERS]; - int scaling_factor_num[VPX_SS_MAX_LAYERS]; - int scaling_factor_den[VPX_SS_MAX_LAYERS]; + int ss_number_layers = 1; + int max_quantizers[VPX_MAX_LAYERS] = {}; + int min_quantizers[VPX_MAX_LAYERS] = {}; + int scaling_factor_num[VPX_SS_MAX_LAYERS] = {}; + int scaling_factor_den[VPX_SS_MAX_LAYERS] = {}; // This is only for SVC for now. - int max_consec_drop; + int max_consec_drop = std::numeric_limits<int>::max(); }; struct VP9FrameParamsQpRTC { @@ -105,9 +96,9 @@ class VP9RateControlRTC { const VP9FrameParamsQpRTC &frame_params); private: - VP9RateControlRTC() {} + VP9RateControlRTC() = default; bool InitRateControl(const VP9RateControlRtcConfig &cfg); - struct VP9_COMP *cpi_; + struct VP9_COMP *cpi_ = nullptr; }; } // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc index 2e6f9a4513..5e565d1b1a 100644 --- a/media/libvpx/libvpx/vp9/simple_encode.cc +++ b/media/libvpx/libvpx/vp9/simple_encode.cc @@ -8,8 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <stdio.h> +#include <stdlib.h> + #include <memory> #include <vector> + #include "./ivfenc.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_enums.h" @@ -888,6 +892,10 @@ void SimpleEncode::ComputeFirstPassStats() { use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth; #endif vpx_image_t img; + if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) { + fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n"); + abort(); + } vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); rewind(in_file_); impl_ptr_->first_pass_stats.clear(); @@ -1053,6 +1061,10 @@ void SimpleEncode::StartEncode() { vp9_set_first_pass_stats(&oxcf, &stats); assert(impl_ptr_->cpi == nullptr); impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); + if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) { + fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n"); + abort(); + } vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c index 8df04f29f0..fe62bac5f2 100644 --- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> @@ -17,6 +19,7 @@ #include "vpx_dsp/psnr.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_timestamp.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" @@ -110,7 +113,6 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp9_extracfg extra_cfg; - vpx_rational64_t timestamp_ratio; vpx_codec_pts_t pts_offset; unsigned char pts_offset_initialized; VP9EncoderConfig oxcf; @@ -190,7 +192,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); - RANGE_CHECK_HI(cfg, g_threads, 64); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); @@ -1140,10 +1142,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, if (res == VPX_CODEC_OK) { priv->pts_offset_initialized = 0; - // TODO(angiebird): Replace priv->timestamp_ratio by - // oxcf->g_timebase_in_ts - priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase); - set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); #if CONFIG_VP9_HIGHBITDEPTH priv->oxcf.use_highbitdepth = @@ -1166,9 +1164,9 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { return VPX_CODEC_OK; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - vpx_enc_deadline_t deadline) { +static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { MODE new_mode = BEST; #if CONFIG_REALTIME_ONLY @@ -1179,13 +1177,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case VPX_RC_ONE_PASS: if (deadline > 0) { // Convert duration parameter from stream timebase to microseconds. - uint64_t duration_us; - VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && (TICKS_PER_SEC % 1000000) == 0); - duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / - (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); + if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) { + ERROR("duration is too big"); + } + uint64_t duration_us = duration * + (uint64_t)ctx->oxcf.g_timebase_in_ts.num / + ((uint64_t)ctx->oxcf.g_timebase_in_ts.den * + (TICKS_PER_SEC / 1000000)); // If the deadline is more that the duration this frame is to be shown, // use good quality mode. Otherwise use realtime mode. 
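pick_quickcompress_mode() now reports an error instead of silently overflowing when it converts `duration` from stream timebase units to microseconds using g_timebase_in_ts. A self-contained sketch of the same guard, assuming only that the tick rate is a positive multiple of 1000000 (which the static assert above enforces for TICKS_PER_SEC):

#include <stdint.h>

/* Sketch of the guard: refuse any duration whose product with the timebase
 * numerator cannot be represented in 64 bits, then scale ticks down to us. */
static int duration_to_us(uint64_t duration, uint64_t timebase_num,
                          uint64_t timebase_den, uint64_t ticks_per_sec,
                          uint64_t *duration_us) {
  if (timebase_num != 0 && duration > UINT64_MAX / timebase_num)
    return -1; /* would overflow; mirrors ERROR("duration is too big") */
  *duration_us =
      duration * timebase_num / (timebase_den * (ticks_per_sec / 1000000));
  return 0;
}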
@@ -1208,6 +1209,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, ctx->oxcf.mode = new_mode; vp9_change_config(ctx->cpi, &ctx->oxcf); } + return VPX_CODEC_OK; } // Turn on to test if supplemental superframe data breaks decoding @@ -1281,6 +1283,10 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, .is_key_frame)) flags |= VPX_FRAME_IS_KEY; + if (!cpi->common.show_frame) { + flags |= VPX_FRAME_IS_INVISIBLE; + } + if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; return flags; @@ -1318,7 +1324,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, volatile vpx_enc_frame_flags_t flags = enc_flags; volatile vpx_codec_pts_t pts = pts_val; VP9_COMP *const cpi = ctx->cpi; - const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio; + const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts; size_t data_sz; vpx_codec_cx_pkt_t pkt; memset(&pkt, 0, sizeof(pkt)); @@ -1347,13 +1353,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } - if (!ctx->pts_offset_initialized) { - ctx->pts_offset = pts; - ctx->pts_offset_initialized = 1; + res = pick_quickcompress_mode(ctx, duration, deadline); + if (res != VPX_CODEC_OK) { + return res; } - pts -= ctx->pts_offset; - - pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); // Handle Flags @@ -1384,20 +1387,53 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; - YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); size_t size, cx_data_sz; unsigned char *cx_data; - cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1); - cpi->svc.time_stamp_superframe = dst_time_stamp; - // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; if (img != NULL) { + YV12_BUFFER_CONFIG sd; + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts; + ctx->pts_offset_initialized = 1; + } + if (pts < ctx->pts_offset) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } + pts -= ctx->pts_offset; + if (pts > INT64_MAX / timebase_in_ts->num) { + vpx_internal_error( + &cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } + const int64_t dst_time_stamp = + timebase_units_to_ticks(timebase_in_ts, pts); + + cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (pts > INT64_MAX - (int64_t)duration) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + vpx_codec_pts_t pts_end = pts + (int64_t)duration; + if (pts_end > INT64_MAX / timebase_in_ts->num) { + vpx_internal_error( + &cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } const int64_t dst_end_time_stamp = - timebase_units_to_ticks(timestamp_ratio, pts + duration); + timebase_units_to_ticks(timebase_in_ts, pts_end); res = image2yuvconfig(img, &sd); if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { @@ -1434,7 +1470,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cx_data_sz < ctx->cx_data_sz / 2) { 
vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR, "Compressed data buffer too small"); - return VPX_CODEC_ERROR; } } @@ -1443,6 +1478,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // compute first pass stats if (img) { int ret; + int64_t dst_time_stamp; int64_t dst_end_time_stamp; vpx_codec_cx_pkt_t fps_pkt; ENCODE_FRAME_RESULT encode_frame_result; @@ -1469,6 +1505,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, #endif // !CONFIG_REALTIME_ONLY } else { ENCODE_FRAME_RESULT encode_frame_result; + int64_t dst_time_stamp; int64_t dst_end_time_stamp; vp9_init_encode_frame_result(&encode_frame_result); while (cx_data_sz >= ctx->cx_data_sz / 2 && @@ -1507,10 +1544,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timestamp_ratio, dst_end_time_stamp - dst_time_stamp); + timebase_in_ts, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.buf = ctx->pending_cx_data; pkt.data.frame.sz = size; @@ -1527,10 +1564,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Add the frame packet to the list of returned packets. pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timestamp_ratio, dst_end_time_stamp - dst_time_stamp); + timebase_in_ts, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; @@ -1979,6 +2016,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_rate_den = oxcf->g_timebase.num; ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + ratectrl_config.base_qp = oxcf->cq_level; if (oxcf->rc_mode == VPX_VBR) { ratectrl_config.rc_mode = VPX_RC_VBR; @@ -2223,7 +2261,7 @@ static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height, return enc_cfg; } -static vp9_extracfg get_extra_cfg() { +static vp9_extracfg get_extra_cfg(void) { vp9_extracfg extra_cfg = default_extra_cfg; return extra_cfg; } diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c index 860f721dc5..7567910b9b 100644 --- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c @@ -19,7 +19,6 @@ #include "vpx/vpx_decoder.h" #include "vpx_dsp/bitreader_buffer.h" #include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_frame_buffers.h" diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk index 44790ef6a4..7a0e2d8d1f 100644 --- a/media/libvpx/libvpx/vp9/vp9cx.mk +++ b/media/libvpx/libvpx/vp9/vp9cx.mk @@ -140,6 +140,7 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c +VP9_CX_SRCS-$(HAVE_SVE) += 
encoder/arm/neon/vp9_error_sve.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h index 01d64b14b7..2643b5578a 100644 --- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h +++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h @@ -22,8 +22,14 @@ enum class FrameDropDecision { kDrop, // Frame is dropped. }; +struct UVDeltaQP { + // For the UV channel: the QP for the dc/ac value is given as + // GetQP() + uvdc/ac_delta_q, where the uvdc/ac_delta_q are negative numbers. + int uvdc_delta_q; + int uvac_delta_q; +}; + struct VpxRateControlRtcConfig { - public: VpxRateControlRtcConfig() { width = 1280; height = 720; diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c index 017525aeee..001d854abe 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c +++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c @@ -14,6 +14,7 @@ */ #include <assert.h> #include <limits.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> #include "vp8/common/blockd.h" @@ -184,8 +185,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, while (0) #else -static void FLOATING_POINT_INIT() {} -static void FLOATING_POINT_RESTORE() {} +static void FLOATING_POINT_INIT(void) {} +static void FLOATING_POINT_RESTORE(void) {} #endif vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, @@ -200,6 +201,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, res = VPX_CODEC_ERROR; else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; +#if ULONG_MAX > UINT32_MAX + else if (duration > UINT32_MAX || deadline > UINT32_MAX) + res = VPX_CODEC_INVALID_PARAM; +#endif else { unsigned int num_enc = ctx->priv->enc.total_encoders; diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c index f9f0dd6025..3f7ff74244 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_image.c +++ b/media/libvpx/libvpx/vpx/src/vpx_image.c @@ -27,6 +27,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); + if (fmt == VPX_IMG_FMT_NONE) goto fail; + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -56,7 +58,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, /* Get chroma shift values for this format */ // For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at - // one time. + // once. 
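The UVDeltaQP struct added to vpx_ratectrl_rtc.h above exposes the chroma QP deltas, which are non-positive offsets applied to the luma QP the controller reports. A self-contained sketch of the arithmetic its comment describes; the concrete QP and delta values here are illustrative only:

#include <stdio.h>

struct UVDeltaQP {
  int uvdc_delta_q; /* delta for the UV DC coefficients, <= 0 */
  int uvac_delta_q; /* delta for the UV AC coefficients, <= 0 */
};

int main(void) {
  const int frame_qp = 60;               /* e.g. the controller's luma QP */
  const struct UVDeltaQP d = { -9, -9 }; /* illustrative deltas */
  printf("uv dc qp = %d, uv ac qp = %d\n", frame_qp + d.uvdc_delta_q,
         frame_qp + d.uvac_delta_q);
  return 0;
}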
switch (fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c index 62c2a9c857..b0687a8135 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_tpl.c +++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c @@ -47,8 +47,8 @@ vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 " %" PRId64 " %d\n", block_stats.inter_cost, block_stats.intra_cost, - block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, - block_stats.recrf_rate, block_stats.ref_frame_index)); + block_stats.mv_c, block_stats.mv_r, block_stats.srcrf_dist, + block_stats.srcrf_rate, block_stats.ref_frame_index)); } } @@ -88,7 +88,7 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, " %" SCNd64 " %d\n", &block_stats->inter_cost, &block_stats->intra_cost, &block_stats->mv_c, &block_stats->mv_r, - &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->srcrf_dist, &block_stats->srcrf_rate, &block_stats->ref_frame_index), 7); } diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h index b12938d3d8..dfdbb3c770 100644 --- a/media/libvpx/libvpx/vpx/vp8cx.h +++ b/media/libvpx/libvpx/vpx/vp8cx.h @@ -772,6 +772,8 @@ enum vp8e_enc_control_id { /*!\brief Codec control to use external RC to control TPL. * * This will use external RC to control the QP and GOP structure for TPL. + * (rc_type & VPX_RC_QP) in vpx_rc_funcs_t must be non zero. + * get_encodeframe_decision callback in vpx_rc_funcs_t also needs to be set. * * Supported in codecs: VP9 */ diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h index 18e3862bd7..809a097d94 100644 --- a/media/libvpx/libvpx/vpx/vpx_encoder.h +++ b/media/libvpx/libvpx/vpx/vpx_encoder.h @@ -31,7 +31,6 @@ extern "C" { #include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_ext_ratectrl.h" -#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,10 +56,15 @@ extern "C" { * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures + * + * \note + * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component + * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses + * vpx_rc_funcs_t. */ -#define VPX_ENCODER_ABI_VERSION \ - (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ - VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (18 + VPX_CODEC_ABI_VERSION + \ + VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -1074,6 +1078,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, * The buffer was set successfully. * \retval #VPX_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. + * + * \note + * `duration` and `deadline` are of the unsigned long type, which can be 32 + * or 64 bits. `duration` and `deadline` must be less than or equal to + * UINT32_MAX so that their ranges are independent of the size of unsigned + * long. 
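This note makes the contract explicit: portable callers should keep both values within 32-bit range themselves rather than depend on the width of unsigned long, since the new check in vpx_encoder.c (earlier in this patch) rejects larger values at runtime on LP64 targets. A caller-side sketch; encoder setup is elided, and rejecting rather than clamping is just one possible policy:

#include <stdint.h>
#include "vpx/vpx_encoder.h"

/* Sketch: keep duration within the 32-bit range the API now mandates. */
static vpx_codec_err_t encode_one(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
                                  vpx_codec_pts_t pts, uint64_t duration) {
  if (duration > UINT32_MAX) return VPX_CODEC_INVALID_PARAM; /* reject early */
  return vpx_codec_encode(ctx, img, pts, (unsigned long)duration,
                          0 /* flags */, VPX_DL_GOOD_QUALITY);
}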
*/ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, const vpx_fixed_buf_t *buf, diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h index 46d290dff4..ba12e4f83b 100644 --- a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h +++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h @@ -26,7 +26,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (7) +#define VPX_EXT_RATECTRL_ABI_VERSION (5 + VPX_TPL_ABI_VERSION) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -81,17 +81,10 @@ typedef void *vpx_rc_model_t; * * The encoder will receive the decision from the external rate control model * through get_encodeframe_decision() defined in vpx_rc_funcs_t. - * - * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q. - * - * If max_frame_size = 0, the encoding ignores max frame size limit. - * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. - * If the encoded frame size is larger than max_frame_size, the frame is - * recoded to meet the size limit, following VP9's recoding principles. */ typedef struct vpx_rc_encodeframe_decision { - int q_index; /**< Quantizer step index [0..255]*/ - int max_frame_size; /**< Maximal frame size allowed to encode a frame*/ + int q_index; /**< Quantizer step index [0..255]*/ + int rdmult; /**< Frame level Lagrangian multiplier*/ } vpx_rc_encodeframe_decision_t; /*!\brief Information for the frame to be encoded. @@ -322,6 +315,7 @@ typedef struct vpx_rc_config { vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ int overshoot_percent; /**< for VBR mode only */ int undershoot_percent; /**< for VBR mode only */ + int base_qp; /**< base QP for leaf frames, 0-255 */ } vpx_rc_config_t; /*!\brief Information passed to the external rate control model to @@ -400,6 +394,7 @@ typedef struct vpx_rc_gop_info { typedef struct vpx_rc_gop_decision { int gop_coding_frames; /**< The number of frames of this GOP */ int use_alt_ref; /**< Whether to use alt ref for this GOP */ + int use_key_frame; /**< Whether to set key frame for this GOP */ } vpx_rc_gop_decision_t; /*!\brief Create an external rate control model callback prototype @@ -446,12 +441,11 @@ typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( * the external rate control model. * * \param[in] rate_ctrl_model rate control model - * \param[in] encode_frame_info information of the coding frame + * \param[in] frame_gop_index index of the frame in current gop * \param[out] frame_decision encode decision of the coding frame */ typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_model_t rate_ctrl_model, const int frame_gop_index, vpx_rc_encodeframe_decision_t *frame_decision); /*!\brief Update encode frame result callback prototype @@ -472,12 +466,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( * the external rate control model. 
* * \param[in] rate_ctrl_model rate control model - * \param[in] gop_info information collected from the encoder * \param[out] gop_decision GOP decision from the model */ typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( - vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision); + vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision); /*!\brief Get the frame rdmult from the external rate control model. * diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h index a250aada60..7e4c9ab7e1 100644 --- a/media/libvpx/libvpx/vpx/vpx_tpl.h +++ b/media/libvpx/libvpx/vpx/vpx_tpl.h @@ -32,19 +32,21 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ +#define VPX_TPL_ABI_VERSION (3) /**<\hideinitializer*/ /*!\brief Temporal dependency model stats for each block before propagation */ typedef struct VpxTplBlockStats { - int16_t row; /**< Pixel row of the top left corner */ - int16_t col; /**< Pixel col of the top left corner */ - int64_t intra_cost; /**< Intra cost */ - int64_t inter_cost; /**< Inter cost */ - int16_t mv_r; /**< Motion vector row */ - int16_t mv_c; /**< Motion vector col */ - int64_t recrf_rate; /**< Rate from reconstructed ref frame */ - int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index in the ref frame buffer */ + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t srcrf_rate; /**< Rate from source ref frame */ + int64_t srcrf_dist; /**< Distortion from source ref frame */ + int64_t inter_pred_err; /**< Inter prediction error */ + int64_t intra_pred_err; /**< Intra prediction error */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ } VpxTplBlockStats; /*!\brief Temporal dependency model stats for each frame before propagation */ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c index 683df5797a..f8b94620d4 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -168,40 +168,40 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, \ if (xoffset == 0) { \ if (yoffset == 0) { \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ src_stride, h, yoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, 
src_stride, 1, w, h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -209,21 +209,21 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ @@ -430,22 +430,22 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } while (--i != 0); } -#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ - uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + 1)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ - xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return 
vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ @@ -460,19 +460,19 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, if (yoffset == 0) { \ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp, source_stride, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp, source_stride, source_stride, h, yoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ @@ -481,7 +481,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ @@ -489,7 +489,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ @@ -497,7 +497,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -506,7 +506,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -514,7 +514,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -522,7 +522,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, 
CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c new file mode 100644 index 0000000000..cebe06b099 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint64x2_t sse = vdupq_n_u64(0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse = vpx_dotq_u16(sse, diff, diff); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + return (uint32_t)horizontal_add_uint64x2(sse); +} + +#define HIGHBD_MSE_WXH_SVE(w, h) \ + uint32_t vpx_highbd_10_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint32_t sse_tmp = \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \ + sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4); \ + *sse = sse_tmp; \ + return sse_tmp; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint32_t sse_tmp = \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \ + sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8); \ + *sse = sse_tmp; \ + return sse_tmp; \ + } + +HIGHBD_MSE_WXH_SVE(16, 16) +HIGHBD_MSE_WXH_SVE(16, 8) +HIGHBD_MSE_WXH_SVE(8, 16) +HIGHBD_MSE_WXH_SVE(8, 8) + +#undef HIGHBD_MSE_WXH_SVE + +// Process a block of width 4 two rows at a time. 
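The MSE macros above instantiate a single SVE kernel: absolute differences are squared and accumulated into 64-bit lanes via vpx_dotq_u16, and the total is then renormalized for the bit depth with ROUND_POWER_OF_TWO (a shift by 4 for 10-bit input, 8 for 12-bit). A plain-C reference of the 10-bit case, shown only to pin down what the vector code computes:

#include <stdint.h>

/* Scalar reference for the 10-bit MSE kernels: 64-bit SSE accumulation,
 * then ROUND_POWER_OF_TWO(sse, 4) to renormalize 10-bit error terms. */
static uint32_t highbd_10_mse_ref(const uint16_t *src, int src_stride,
                                  const uint16_t *ref, int ref_stride, int w,
                                  int h) {
  uint64_t sse = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sse += (uint64_t)(diff * diff);
    }
  }
  return (uint32_t)((sse + 8) >> 4); /* ROUND_POWER_OF_TWO(sse, 4) */
}

The variance helpers that follow (continuing with the width-4 case announced by the comment above) reuse the same accumulation, additionally track the signed sum, and subtract sum * sum / (w * h) from the rounded SSE; the 10- and 12-bit variants clamp the result at zero because the per-value rounding can otherwise drive the difference negative.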
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + sse_s64 = vpx_dotq_s16(sse_s64, diff, diff); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + h -= 2; + } while (h != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = vld1q_u16(src_ptr); + const uint16x8_t r = vld1q_u16(ref_ptr); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + sse_s64 = vpx_dotq_s16(sse_s64, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const uint16x8_t s0 = vld1q_u16(src_ptr); + const uint16x8_t s1 = vld1q_u16(src_ptr + 8); + + const uint16x8_t r0 = vld1q_u16(ref_ptr); + const uint16x8_t r1 = vld1q_u16(ref_ptr + 8); + + const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1)); + + sum_s32[0] = vpadalq_s16(sum_s32[0], diff0); + sum_s32[1] = vpadalq_s16(sum_s32[1], diff1); + + sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0); + sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + + *sum = horizontal_add_int32x4(sum_s32[0]); + *sse = horizontal_add_int64x2(sse_s64[0]); +} + +static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int i = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + i); + const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8); + const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16); + const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24); + + const uint16x8_t r0 = vld1q_u16(ref_ptr + i); + const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8); + const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16); + const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24); + + const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1)); + const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2)); + const int16x8_t diff3 = 
vreinterpretq_s16_u16(vsubq_u16(s3, r3)); + + sum_s32[0] = vpadalq_s16(sum_s32[0], diff0); + sum_s32[1] = vpadalq_s16(sum_s32[1], diff1); + sum_s32[2] = vpadalq_s16(sum_s32[2], diff2); + sum_s32[3] = vpadalq_s16(sum_s32[3], diff3); + + sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0); + sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1); + sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2); + sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3); + + i += 32; + } while (i < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]); + + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]); + + *sum = horizontal_add_int32x4(sum_s32[0]); + *sse = horizontal_add_int64x2(sse_s64[0]); +} + +static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +#define HBD_VARIANCE_WXH_SVE(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +HBD_VARIANCE_WXH_SVE(4, 4) +HBD_VARIANCE_WXH_SVE(4, 8) + +HBD_VARIANCE_WXH_SVE(8, 4) +HBD_VARIANCE_WXH_SVE(8, 8) +HBD_VARIANCE_WXH_SVE(8, 16) + +HBD_VARIANCE_WXH_SVE(16, 8) +HBD_VARIANCE_WXH_SVE(16, 16) +HBD_VARIANCE_WXH_SVE(16, 32) + +HBD_VARIANCE_WXH_SVE(32, 16) +HBD_VARIANCE_WXH_SVE(32, 32) +HBD_VARIANCE_WXH_SVE(32, 64) + +HBD_VARIANCE_WXH_SVE(64, 32) +HBD_VARIANCE_WXH_SVE(64, 64) + +#define HIGHBD_GET_VAR_SVE(s) \ + void vpx_highbd_8_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } + +HIGHBD_GET_VAR_SVE(8) +HIGHBD_GET_VAR_SVE(16) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 47684473ca..b5a944d299 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -14,86 +14,51 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, - int16x4_t *const s0, int16x4_t *const s1, - int16x4_t *const s2, int16x4_t *const s3) { - *s0 = vld1_s16(s); - s += p; - *s1 = vld1_s16(s); - s += p; - *s2 = vld1_s16(s); - s += p; - *s3 = vld1_s16(s); -} - -static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, - uint16x8_t *const s0, uint16x8_t *const s1, - uint16x8_t *const s2, uint16x8_t *const s3) { - *s0 = vld1q_u16(s); - s += p; - *s1 = vld1q_u16(s); - s += p; - *s2 = vld1q_u16(s); - s += p; - *s3 = vld1q_u16(s); -} - -static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, - int16x8_t *const s0, int16x8_t *const s1, - int16x8_t *const s2, int16x8_t *const s3, - int16x8_t *const s4, int16x8_t *const s5, - int16x8_t *const s6, int16x8_t *const s7) { - *s0 = vld1q_s16(s); - s += p; - *s1 = vld1q_s16(s); - s += p; - *s2 = vld1q_s16(s); - s += p; - *s3 = vld1q_s16(s); - s += p; - *s4 = vld1q_s16(s); - s += 
p; - *s5 = vld1q_s16(s); - s += p; - *s6 = vld1q_s16(s); - s += p; - *s7 = vld1q_s16(s); +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) { + int32x4_t sum = vmull_lane_s16(s0, filters, 0); + sum = vmlal_lane_s16(sum, s1, filters, 1); + sum = vmlal_lane_s16(sum, s2, filters, 2); + sum = vmlal_lane_s16(sum, s3, filters, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } -static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, - const uint16x8_t s0, const uint16x8_t s1, - const uint16x8_t s2, const uint16x8_t s3, - const uint16x8_t s4, const uint16x8_t s5, - const uint16x8_t s6, const uint16x8_t s7) { - vst1q_u16(s, s0); - s += p; - vst1q_u16(s, s1); - s += p; - vst1q_u16(s, s2); - s += p; - vst1q_u16(s, s3); - s += p; - vst1q_u16(s, s4); - s += p; - vst1q_u16(s, s5); - s += p; - vst1q_u16(s, s6); - s += p; - vst1q_u16(s, s7); +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); } -static INLINE int32x4_t highbd_convolve8_4( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, const uint16x4_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum; - sum = vmull_lane_s16(s0, filters_lo, 0); + int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0); sum = vmlal_lane_s16(sum, s1, filters_lo, 1); sum = vmlal_lane_s16(sum, s2, filters_lo, 2); sum = vmlal_lane_s16(sum, s3, filters_lo, 3); @@ -101,7 +66,9 @@ static INLINE int32x4_t highbd_convolve8_4( sum = vmlal_lane_s16(sum, s5, filters_hi, 1); sum = vmlal_lane_s16(sum, s6, filters_hi, 2); sum = vmlal_lane_s16(sum, s7, filters_hi, 3); - return sum; + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } static INLINE uint16x8_t @@ -111,10 +78,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t filters, const uint16x8_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum0, sum1; - uint16x8_t d; - sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); 
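These rewritten convolution kernels all share one pattern: widen to 32-bit accumulators, multiply-accumulate one filter tap per lane, narrow with a rounding shift by FILTER_BITS (7, per vpx_filter.h, replacing the hard-coded 7 used before), and clamp against the bit-depth maximum, which vqrshrun's 16-bit unsigned saturation alone cannot provide for 10/12-bit data. A scalar sketch of one 8-tap output pixel, for reference only:

#include <stdint.h>

#define FILTER_BITS 7

/* Scalar reference for one output pixel of the 8-tap high-bitdepth filter:
 * accumulate in 32 bits, round-shift, then clamp to [0, (1 << bd) - 1]. */
static uint16_t highbd_convolve8_pixel(const int16_t s[8], const int16_t f[8],
                                       int bd) {
  int32_t sum = 0;
  int k;
  for (k = 0; k < 8; ++k) sum += (int32_t)s[k] * f[k];
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS; /* rounding shift */
  if (sum < 0) sum = 0;                 /* vqrshrun saturates negatives to 0 */
  if (sum > (1 << bd) - 1) sum = (1 << bd) - 1; /* the vmin against max */
  return (uint16_t)sum;
}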
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); @@ -122,7 +87,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); - sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); @@ -130,9 +96,152 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); - d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); - d = vminq_u16(d, max); - return d; + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_4tap_horiz_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max); + uint16x4_t d1 = + highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max); + uint16x4_t d2 = + highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max); + uint16x4_t d3 = + highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = + highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max); + uint16x8_t d1 = + highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max); + uint16x8_t d2 = + highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max); + uint16x8_t d3 = + highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_8tap_horiz_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = 
vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filter, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filter, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filter, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filter, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filter, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filter, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } } void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, @@ -143,202 +252,25 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, if (x_step_q4 != 16) { vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); - } else { - const int16x8_t filters = vld1q_s16(filter[x0_q4]); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3; - - if (h == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - s0 = vreinterpret_s16_u16(vget_low_u16(t0)); - s1 = vreinterpret_s16_u16(vget_low_u16(t1)); - s2 = vreinterpret_s16_u16(vget_low_u16(t2)); - s3 = vreinterpret_s16_u16(vget_low_u16(t3)); - s4 = 
vreinterpret_s16_u16(vget_high_u16(t0)); - s5 = vreinterpret_s16_u16(vget_high_u16(t1)); - s6 = vreinterpret_s16_u16(vget_high_u16(t2)); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - src += 7; - - do { - load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); - transpose_s16_4x4d(&s7, &s8, &s9, &s10); - - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + return; + } - d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - transpose_u16_4x4q(&d01, &d23); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); - vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); - vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); - vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - src += 4; - dst += 4; - w -= 4; - } while (w > 0); - } else { - int16x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3; - - if (w == 4) { - do { - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, - &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - transpose_u16_8x4(&d0, &d1, &d2, &d3); - vst1_u16(dst, vget_low_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d3)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d3)); - dst += dst_stride; - h -= 8; - } while (h > 0); - } else { - int width; - const uint16_t *s; - uint16_t *d; - int16x8_t s11, s12, s13, s14; - 
uint16x8_t d4, d5, d6, d7; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - - do { - load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, - max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, - max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, - max); - d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, - max); - d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, - max); - d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, - max); - d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, - max); - d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, - filters, max); - - transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } while (h > 0); - } - } + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2); + highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap, bd); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap, bd); } } @@ -352,66 +284,233 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filters, max); + uint16x4_t d1 = 
highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_4tap_vert_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max); + uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max); + uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max); + uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3; - - if (h == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t t0, t1, t2, t3; - uint16x8_t d01, d23, t01, t23; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * 
src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - s0 = vreinterpret_s16_u16(vget_low_u16(t0)); - s1 = vreinterpret_s16_u16(vget_low_u16(t1)); - s2 = vreinterpret_s16_u16(vget_low_u16(t2)); - s3 = vreinterpret_s16_u16(vget_low_u16(t3)); - s4 = vreinterpret_s16_u16(vget_high_u16(t0)); - s5 = vreinterpret_s16_u16(vget_high_u16(t1)); - s6 = vreinterpret_s16_u16(vget_high_u16(t2)); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - src += 7; + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; do { - load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); - transpose_s16_4x4d(&s7, &s8, &s9, &s10); - - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - t01 = vminq_u16(t01, max); - t23 = vminq_u16(t23, max); - transpose_u16_4x4q(&t01, &t23); - - d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 2 * dst_stride)); - d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), - vld1_u16(dst + 3 * dst_stride)); - d01 = vrhaddq_u16(d01, t01); - d23 = vrhaddq_u16(d23, t23); - - vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); - vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); - vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max); + uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max); + uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max); + uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_convolve_8tap_vert_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max); + uint16x4_t d1 = + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max); + uint16x4_t d2 = + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max); + uint16x4_t d3 = + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 
4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -420,164 +519,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, s4 = s8; s5 = s9; s6 = s10; - src += 4; - dst += 4; - w -= 4; - } while (w > 0); - } else { - int16x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; - - if (w == 4) { - do { - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, - &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - - d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 4 * dst_stride)); - d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), - vld1_u16(dst + 5 * dst_stride)); - d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), - vld1_u16(dst + 6 * dst_stride)); - d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), - vld1_u16(dst + 7 * dst_stride)); - d0 = vrhaddq_u16(d0, t0); - d1 = vrhaddq_u16(d1, t1); - d2 = vrhaddq_u16(d2, t2); - d3 = vrhaddq_u16(d3, t3); - - vst1_u16(dst, vget_low_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d3)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d3)); - dst += dst_stride; - h -= 8; - } while (h > 0); - } else { - int width; - const uint16_t *s; - uint16_t *d; - 
int16x8_t s11, s12, s13, s14; - uint16x8_t d4, d5, d6, d7; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - - do { - load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, - max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, - max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, - max); - d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, - max); - d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, - max); - d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, - max); - d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, - max); - d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, - filters, max); - - transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); - d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); - d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); - d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); - d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); - d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); - d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); - d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); - - store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } while (h > 0); - } - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } } @@ -589,160 +538,25 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, if (y_step_q4 != 16) { vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); - } else { - const int16x8_t filters = vld1q_s16(filter[y0_q4]); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3 * src_stride; - - if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23; - - s0 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s1 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s2 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s3 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s4 = 
vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s5 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s6 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; + return; + } - do { - s7 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s8 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s9 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s10 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - vst1_u16(dst, vget_low_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d23)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d23)); - dst += dst_stride; + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - h -= 4; - } while (h > 0); - } else { - int height; - const uint16_t *s; - uint16_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s1 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s2 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s3 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s4 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s5 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s6 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s8 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s9 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s10 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = - 
highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - vst1q_u16(d, d0); - d += dst_stride; - vst1q_u16(d, d1); - d += dst_stride; - vst1q_u16(d, d2); - d += dst_stride; - vst1q_u16(d, d3); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2); + highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap, bd); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, bd); } } @@ -756,78 +570,89 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + src -= 3 * src_stride; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + uint16x4_t d1 = + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + uint16x4_t d2 = + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + uint16x4_t d3 = + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3 * src_stride; - - if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23, t01, t23; - - s0 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s1 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s2 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s3 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s4 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s5 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s6 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s8 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s9 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s10 = 
vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - t01 = vminq_u16(t01, max); - t23 = vminq_u16(t23, max); - - d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 1 * dst_stride)); - d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), - vld1_u16(dst + 3 * dst_stride)); - d01 = vrhaddq_u16(d01, t01); - d23 = vrhaddq_u16(d23, t23); - - vst1_u16(dst, vget_low_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d23)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d23)); - dst += dst_stride; + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -836,96 +661,592 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, s4 = s8; s5 = s9; s6 = s10; - h -= 4; - } while (h > 0); - } else { - int height; - const uint16_t *s; - uint16_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s1 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s2 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s3 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s4 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s5 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s6 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s8 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s9 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s10 = 
vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - d0 = vld1q_u16(d + 0 * dst_stride); - d1 = vld1q_u16(d + 1 * dst_stride); - d2 = vld1q_u16(d + 2 * dst_stride); - d3 = vld1q_u16(d + 3 * dst_stride); - d0 = vrhaddq_u16(d0, t0); - d1 = vrhaddq_u16(d1, t1); - d2 = vrhaddq_u16(d2, t2); - d3 = vrhaddq_u16(d3, t3); - - vst1q_u16(d, d0); - d += dst_stride; - vst1q_u16(d, d1); - d += dst_stride; - vst1q_u16(d, d2); - d += dst_stride; - vst1q_u16(d, d3); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } +static INLINE void highbd_convolve_2d_4tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int16x4_t y_filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max)); + + s += 3 * src_stride; + + do { + int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max)); + + uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * 
src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max)); + + s += 3 * src_stride; + + do { + int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max)); + + uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void highbd_convolve_2d_8tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter, + const int16x8_t y_filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + 
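+    // Apply the horizontal filter to the seven rows loaded above. These
+    // intermediate results prime the 8-tap vertical filter; the loop
+    // further below then produces four output rows per iteration.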
int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x4_t v_s3 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x4_t v_s7 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x4_t v_s8 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x4_t v_s9 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x4_t v_s10 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_8x8(s + 3 * src_stride, 
1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x8_t v_s7 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x8_t v_s8 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x8_t v_s9 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x8_t v_s10 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t 
dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2); + + highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); + return; + } + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; } + + // Averaging convolution always uses an 8-tap filter. + const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; + // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 + // lines post both horizontally and vertically. 
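+  // With SUBPEL_TAPS == 8 this steps back 3 samples horizontally and
+  // 3 rows vertically, i.e. src -= 3 + 3 * src_stride.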
+ src = src - horiz_offset - vert_offset; + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x4_t v_s3 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x4_t v_s7 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x4_t v_s8 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x4_t v_s9 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x4_t v_s10 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, 
v_s5, + v_s6, v_s7, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x8_t v_s7 
= vreinterpretq_s16_u16( + highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x8_t v_s8 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x8_t v_s9 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x8_t v_s10 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c new file mode 100644 index 0000000000..7fc0a57c90 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6, + 1, 3, 5, 7 }; + +static INLINE uint16x4_t highbd_convolve4_4(const int16x4_t s[4], + const int16x8_t filter, + const uint16x4_t max) { + int16x8_t s01 = vcombine_s16(s[0], s[1]); + int16x8_t s23 = vcombine_s16(s[2], s[3]); + + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0); + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS); + return vmin_u16(res_u16, max); +} + +static INLINE uint16x8_t highbd_convolve4_8(const int16x8_t s[4], + const int16x8_t filter, + const uint16x8_t max, + uint16x8_t idx) { + int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + res = vpx_tbl_u16(res, idx); + + return vminq_u16(res, max); +} + +static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4], + const int16x8_t filter, + const uint16x4_t max) { + int64x2_t sum[4]; + + sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS); + return vmin_u16(res_u16, max); +} + +static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8], + const int16x8_t filter, + const uint16x8_t max) { + int64x2_t sum[8]; + + sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter); + sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter); + sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter); + sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter); + sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); + int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_4tap_horiz_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) { + const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0)); + 
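// Editor's sketch (not part of the upstream diff): the vpx_dotq_s16() /
// vpx_dotq_lane_s16() bridge calls used by the helpers above are assumed to
// wrap the SVE SDOT instruction, which accumulates four adjacent 16-bit
// products into each 64-bit lane. That is why the 4-tap filter is
// zero-padded to eight taps here, and why highbd_convolve8_4() pairs lanes
// with vpaddq_s64() before narrowing. A scalar model of one 128-bit SDOT:
static void sdot_s16_to_s64_model(const int16_t s[8], const int16_t f[8],
                                  int64_t acc[2]) {
  for (int lane = 0; lane < 2; ++lane) {
    for (int i = 0; i < 4; ++i) {
      acc[lane] += (int64_t)s[4 * lane + i] * (int64_t)f[4 * lane + i];
    }
  }
}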
+ if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4(s0, filter, max); + uint16x4_t d1 = highbd_convolve4_4(s1, filter, max); + uint16x4_t d2 = highbd_convolve4_4(s2, filter, max); + uint16x4_t d3 = highbd_convolve4_4(s3, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const uint16x8_t idx = vld1q_u16(kTblConv4_8); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8(s0, filter, max, idx); + uint16x8_t d1 = highbd_convolve4_8(s1, filter, max, idx); + uint16x8_t d2 = highbd_convolve4_8(s2, filter, max, idx); + uint16x8_t d3 = highbd_convolve4_8(s3, filter, max, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_8tap_horiz_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = 
highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2); + highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap, bd); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap, bd); + } +} + +void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = 
highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c new file mode 100644 index 0000000000..4ed7718f7d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 0, 5, 6, 7, 4, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 0, 1, 6, 7, 4, 5, + // Shift left and insert three new columns in transposed 4x4 block. 
+ 3, 0, 1, 2, 7, 4, 5, 6, +}; +// clang-format on + +static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t t0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(t0123.val[0]); + res[1] = vreinterpretq_s16_s32(t0123.val[1]); +} + +static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // res[2]: 04 14 24 34 05 15 25 35 + // res[3]: 06 16 26 36 07 17 27 37 + + int16x8x2_t s01 = vzipq_s16(s0, s1); + int16x8x2_t s23 = vzipq_s16(s2, s3); + + int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]), + vreinterpretq_s32_s16(s23.val[0])); + int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]), + vreinterpretq_s32_s16(s23.val[1])); + + res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]); + res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]); + res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]); + res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]); +} + +static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4], + int16x8_t res[4], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); + res[2] = vpx_tbl2_s16(s0[2], s1[2], idx); + res[3] = vpx_tbl2_s16(s0[3], s1[3], idx); +} + +static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2], + int16x8_t res[2], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); +} + +static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2], + int16x8_t s_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0); + sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0); + sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4], + const int16x8_t s_hi[4], + const int16x8_t filter, + const uint16x8_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0); + sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0); + sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1); + + int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0); + sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1); + + int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 
0); + sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_8tap_vert_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + assert(w >= 4 && h >= 4); + uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl); + + // Correct indices by the size of vector length. + merge_tbl_idx.val[0] = vaddq_u16( + merge_tbl_idx.val[0], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL))); + merge_tbl_idx.val[1] = vaddq_u16( + merge_tbl_idx.val[1], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL))); + merge_tbl_idx.val[2] = vaddq_u16( + merge_tbl_idx.val[2], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL))); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, sA; + + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, sA; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + transpose_concat_8x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max); + uint16x8_t d1 = 
highbd_convolve8_8_v(s1234, s5678, filter, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, bd); + } +} + +void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + src -= 3 * src_stride; + + uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl); + + // Correct indices by the size of vector length. 
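// Editor's sketch (not part of the upstream diff): vpx_tbl2_s16() is
// assumed to wrap the SVE TBL2 instruction, which indexes the concatenation
// of two vectors; the second vector starts at index svcnth() (the number of
// 16-bit lanes per vector). The multipliers below place svcnth() into just
// the 16-bit positions of each 64-bit group that must select from the
// second table vector. A scalar model of the lookup for 128-bit vectors
// (svcnth() == 8):
static void tbl2_u16_model(const uint16_t t0[8], const uint16_t t1[8],
                           const uint16_t idx[8], uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = (idx[i] < 8) ? t0[idx[i]] : t1[idx[i] - 8];
  }
}
// E.g. val[0] becomes { 1, 2, 3, 8+0, 5, 6, 7, 8+4 }: shift each transposed
// 4x4 block left by one column and insert the new column from the second
// vector, matching the comments on kDotProdMergeBlockTbl.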
+ merge_tbl_idx.val[0] = vaddq_u16( + merge_tbl_idx.val[0], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL))); + merge_tbl_idx.val[1] = vaddq_u16( + merge_tbl_idx.val[1], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL))); + merge_tbl_idx.val[2] = vaddq_u16( + merge_tbl_idx.val[2], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL))); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, sA; + + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, sA; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + transpose_concat_8x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = 
s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c deleted file mode 100644 index 414ade3530..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h, int bd) { - // + 1 to make it divisible by 4 - uint16_t temp[64 * 136]; - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - /* Filter starting 3 lines back. The neon implementation will ignore the given - * height and filter a multiple of 4 lines. Since this goes in to the temp - * buffer which has lots of extra room and is subsequently discarded this is - * safe if somewhat less than ideal. */ - vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height, bd); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h, int bd) { - // + 1 to make it divisible by 4 - uint16_t temp[64 * 136]; - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. 
- */ - vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, - bd); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c index c54e588239..579096d78a 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_) // flip_sign_16 #define FUN_FLIP_SIGN_BACK(w, r) \ static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ - const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ + const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80); \ return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h index 38b0b6c1a9..268c4bd962 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h @@ -154,11 +154,10 @@ static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x2_t a_u32; - if (stride == 4) return vld1_u8(buf); + uint32x2_t a_u32 = vdup_n_u32(0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vdup_n_u32(a); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -177,11 +176,10 @@ static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, ptrdiff_t stride) { uint64_t a; - uint64x2_t a_u64; - if (stride == 4) return vld1q_u16(buf); + uint64x2_t a_u64 = vdupq_n_u64(0); memcpy(&a, buf, 8); buf += stride; - a_u64 = vdupq_n_u64(a); + a_u64 = vsetq_lane_u64(a, a_u64, 0); memcpy(&a, buf, 8); a_u64 = vsetq_lane_u64(a, a_u64, 1); return vreinterpretq_u16_u64(a_u64); @@ -191,10 +189,6 @@ static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { const uint32x2_t a_u32 = vreinterpret_u32_u8(a); - if (stride == 4) { - vst1_u8(buf, a); - return; - } uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); buf += stride; uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); @@ -204,11 +198,10 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32; - if (stride == 4) return vld1q_u8(buf); + uint32x4_t a_u32 = vdupq_n_u32(0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vdupq_n_u32(a); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); @@ -225,10 +218,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride, const uint8x16_t a) { const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); - if (stride == 4) { - vst1q_u8(buf, a); - return; - } uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); buf += stride; uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); @@ -449,6 +438,142 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, vst1q_u8(s, s7); } +static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2) 
{ + vst1_u16(s, s0); + s += p; + vst1_u16(s, s1); + s += p; + vst1_u16(s, s2); +} + +static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); +} + +static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += p; + vst1_u16(s, s1); + s += p; + vst1_u16(s, s2); + s += p; + vst1_u16(s, s3); +} + +static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); +} + +static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); +} + +static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); +} + static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, @@ -470,4 +595,46 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, *s7 = vld1q_u16(s); } +static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6, int16x4_t *s7) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); +} + 
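// Editor's sketch (not part of the upstream diff): a minimal usage example
// for the strided load/store helpers added in this hunk. They gather or
// scatter consecutive rows of a block so the convolution kernels can keep a
// whole sliding window of rows in registers. The function below simply sums
// four rows of eight 16-bit samples; its name is illustrative only.
static INLINE void sum_4_rows_s16_model(const int16_t *src, ptrdiff_t stride,
                                        int16_t *dst) {
  int16x8_t r0, r1, r2, r3;
  load_s16_8x4(src, stride, &r0, &r1, &r2, &r3);
  vst1q_s16(dst, vaddq_s16(vaddq_s16(r0, r1), vaddq_s16(r2, r3)));
}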
+static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6, int16x8_t *s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c new file mode 100644 index 0000000000..a18cbbd736 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) { + if (size == 4) { + int16x4_t s[4]; + int64x2_t sum = vdupq_n_s64(0); + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + + int16x8_t s01 = vcombine_s16(s[0], s[1]); + int16x8_t s23 = vcombine_s16(s[2], s[3]); + + sum = vpx_dotq_s16(sum, s01, s01); + sum = vpx_dotq_s16(sum, s23, s23); + + return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum)); + } else { + int rows = size; + int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + const int16_t *src_ptr = src; + int cols = size; + + do { + int16x8_t s[8]; + load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + + sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]); + sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]); + sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]); + sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]); + sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]); + sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]); + sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]); + sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]); + + src_ptr += 8; + cols -= 8; + } while (cols); + + src += 8 * stride; + rows -= 8; + } while (rows); + + sum[0] = vaddq_s64(sum[0], sum[1]); + sum[2] = vaddq_s64(sum[2], sum[3]); + sum[0] = vaddq_s64(sum[0], sum[2]); + + return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0])); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h index 74f85a6bb6..c989a6721b 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h @@ -524,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, *a7 = vreinterpretq_s32_s64(c3.val[1]); } -// Note: Using 'd' registers or 'q' registers has almost identical speed. We use -// 'q' registers here to save some instructions. static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, uint8x8_t *a6, uint8x8_t *a7) { - // Swap 8 bit elements. 
Goes from: + // Widen to 128-bit registers (usually a no-op once inlined). + const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0)); + const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0)); + const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0)); + const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0)); + const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0)); + const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0)); + const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0)); + const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0)); + + // Zip 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 @@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: - // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 - // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 - // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 - // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 - - const uint8x16x2_t b0 = - vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); - const uint8x16x2_t b1 = - vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); - - // Swap 16 bit elements resulting in: - // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 - // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 - // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 - // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 - - const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), - vreinterpretq_u16_u8(b1.val[0])); - const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), - vreinterpretq_u16_u8(b1.val[1])); - - // Unzip 32 bit elements resulting in: + // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0]; + const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0]; + const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0]; + const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0]; + + // Zip 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // c1.val[1]: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const uint16x8x2_t c0 = + vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1)); + const uint16x8x2_t c1 = + vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3)); + + // Zip 32 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0])); - const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); *a1 = 
vreinterpret_u8_u32(vget_high_u32(d0.val[0])); - *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); - *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); - *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); - *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 65fb67c984..037ea1142d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -20,44 +20,36 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -// Note: -// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src). -// 2. After refactoring the shared code in kernel loops with inline functions, -// the decoder speed dropped a lot when using gcc compiler. Therefore there is -// no refactoring for those parts by now. -// 3. For horizontal convolve, there is an alternative optimization that -// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8 -// samples in each are read from memory: src, (src+1), (src+2), (src+3), -// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract -// instructions. This optimization is much faster in speed unit test, but slowed -// down the whole decoder by 5%. - -static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x4_t filter) { +static INLINE void convolve_4tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const uint8x8_t x_filter = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1); + + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. 
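// Editor's sketch (not part of the upstream diff): the halving above is
// exact because all 4-tap and bilinear taps are even and sum to 128
// (FILTER_BITS == 7), so for any input x:
//   (sum(c[i] * x[i]) + 64) >> 7 == (sum((c[i] / 2) * x[i]) + 32) >> 6
// and the halved taps fit in 8 bits. A scalar model, assuming nonnegative
// taps for simplicity (the vector code takes vabsq_s16() and accounts for
// tap signs inside convolve4_8()):
static uint8_t convolve4_scalar_model(const uint8_t x[4], const int16_t c[4]) {
  int sum = 0;
  for (int i = 0; i < 4; ++i) sum += (c[i] >> 1) * x[i];
  sum = (sum + 32) >> (FILTER_BITS - 1);
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}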
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2), + vdup_lane_u8(x_filter, 3), + vdup_lane_u8(x_filter, 4), + vdup_lane_u8(x_filter, 5) }; + if (w == 4) { do { - int16x4_t s0[4], s1[4]; - - int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src))); - s0[0] = vget_low_s16(vextq_s16(t0, t0, 0)); - s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); - s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); - s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + uint8x8_t s01[4]; - int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride))); - s1[0] = vget_low_s16(vextq_s16(t1, t1, 0)); - s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); - s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); - s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + s01[0] = load_unaligned_u8(src + 0, src_stride); + s01[1] = load_unaligned_u8(src + 1, src_stride); + s01[2] = load_unaligned_u8(src + 2, src_stride); + s01[3] = load_unaligned_u8(src + 3, src_stride); - int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); - int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps); - store_u8(dst, dst_stride, d01); + store_unaligned_u8(dst, dst_stride, d01); src += 2 * src_stride; dst += 2 * dst_stride; @@ -70,25 +62,20 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, int width = w; do { - int16x8_t t0[2], t1[2]; - int16x8_t s0[4], s1[4]; - - t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8))); - s0[0] = vextq_s16(t0[0], t0[1], 0); - s0[1] = vextq_s16(t0[0], t0[1], 1); - s0[2] = vextq_s16(t0[0], t0[1], 2); - s0[3] = vextq_s16(t0[0], t0[1], 3); - - t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride))); - t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8))); - s1[0] = vextq_s16(t1[0], t1[1], 0); - s1[1] = vextq_s16(t1[0], t1[1], 1); - s1[2] = vextq_s16(t1[0], t1[1], 2); - s1[3] = vextq_s16(t1[0], t1[1], 3); - - uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); - uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + uint8x8_t s0[4], s1[4]; + + s0[0] = vld1_u8(s + 0); + s0[1] = vld1_u8(s + 1); + s0[2] = vld1_u8(s + 2); + s0[3] = vld1_u8(s + 3); + + s1[0] = vld1_u8(s + src_stride + 0); + s1[1] = vld1_u8(s + src_stride + 1); + s1[2] = vld1_u8(s + src_stride + 2); + s1[3] = vld1_u8(s + src_stride + 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps); vst1_u8(d, d0); vst1_u8(d + dst_stride, d1); @@ -103,47 +90,41 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, } } -static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x8_t filter) { - uint8x8_t t0, t1, t2, t3; - +static INLINE void convolve_8tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { if (h == 4) { - uint8x8_t d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + src += 7; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + transpose_u8_8x4(&t7, &t8, &t9, &t10); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_u8_4x4(&d01, &d23); @@ -162,52 +143,33 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, w -= 4; } while (w != 0); } else { - int width; - const uint8_t *s; - uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = 
vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_u8_8x4(&d04, &d15, &d26, &d37); @@ -216,57 +178,53 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { - uint8_t *d; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7; + uint8_t *d = dst; + int 
width = w; do { - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -304,17 +262,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, (void)y0_q4; (void)y_step_q4; + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. 
- */ - const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1); - vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, - x_filter_4tap); + convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter); } else { - const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); - vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, - x_filter_8tap); + convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter); } } @@ -324,7 +279,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int16x8_t filters = vld1q_s16(filter[x0_q4]); - uint8x8_t t0, t1, t2, t3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -337,48 +291,41 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (h == 4) { - uint8x8_t d01, d23, dd01, dd23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + src += 7; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + transpose_u8_8x4(&t7, &t8, &t9, &t10); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, 
s5, s6, s7, filters); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_u8_4x4(&d01, &d23); - dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); - dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -398,61 +345,40 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - int width; - const uint8_t *s; - uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { - uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; - do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + 
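// Illustrative note, not part of the upstream patch: the _avg_ variants blend
// the filter output with the pixels already in dst via vrhadd_u8, a per-lane
// rounding halving add. In scalar terms each output byte is
//   dst[i] = (uint8_t)((convolved[i] + dst[i] + 1) >> 1);
// the same rounded average computed by the scalar convolve_avg reference.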
uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&d04, &d15, &d26, &d37); - dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); - dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); - dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); - dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); + uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); d04 = vrhadd_u8(d04, dd04); d15 = vrhadd_u8(d15, dd15); @@ -464,65 +390,54 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h != 0); } else { - uint8_t *d; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7; + uint8_t *d = dst; + int width = w; do { - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d4 = convolve8_8(s4, s5, 
s6, s7, s8, s9, s10, s11, filters); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + uint8x8_t d5 = + convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -556,152 +471,37 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x4_t filter) { - if (w == 4) { - uint8x8_t t0, t1, t2, t3, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3; - - load_u8_8x3(src, src_stride, &t0, &t1, &t2); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - - src += 3 * src_stride; - - do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - - d0 = convolve4_4(s0, s1, s2, s3, filter); - d1 = convolve4_4(s1, s2, s3, s4, filter); - d2 = convolve4_4(s2, s3, s4, s5, filter); - d3 = convolve4_4(s3, s4, s5, s6, filter); - /* We halved the filter values so -1 from right shift. 
*/ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - s0 = s4; - s1 = s5; - s2 = s6; - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6; - - do { - load_u8_8x3(src, src_stride, &t0, &t1, &t2); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - - s = src + 3 * src_stride; - d = dst; - height = h; - - do { - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - - d0 = convolve4_8(s0, s1, s2, s3, filter); - d1 = convolve4_8(s1, s2, s3, s4, filter); - d2 = convolve4_8(s2, s3, s4, s5, filter); - d3 = convolve4_8(s3, s4, s5, s6, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s0 = s4; - s1 = s5; - s2 = s6; - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x8_t filter) { +static INLINE void convolve_8tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); src += 7 * src_stride; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 
1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -718,54 +518,33 @@ static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, h -= 4; } while (h != 0); } else { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - s = src + 7 * src_stride; - d = dst; - height = h; + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; + int height = h; do { - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 
2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -800,17 +579,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, (void)x_step_q4; (void)y_step_q4; + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. - */ - const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1); - vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, - w, h, y_filter_4tap); + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); - vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, - dst_stride, w, h, y_filter_8tap); + convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride, + w, h, y_filter); } } @@ -832,45 +608,35 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); src += 7 * src_stride; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * 
dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -890,54 +656,33 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - s = src + 7 * src_stride; - d = dst; - height = h; + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; + int height = h; do { - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * 
dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h index 4ecaee0f99..10cc761ccd 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -17,360 +17,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" -#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) - -void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, - int y_step_q4, int w, int h); - -static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples, - const int32x4_t correction, - const int8x8_t filters) { - /* Accumulate dot product into 'correction' to account for range clamp. */ - int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16_t permute_tbl) { - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x16_t clamped_samples = - vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { - /* Sample range-clamping and permutation are performed by the caller. */ - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0); - - /* Narrow and re-pack. 
*/ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, samples_lo, filters, 0); - sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - int32x4_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filters) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. 
*/ - sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0); - sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0); - sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); - sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) - -#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h); - -static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, - const int8x8_t filters) { - int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16_t permute_tbl) { - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - - int32x4_t sum = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filters) { - /* Sample permutation is performed by the caller. */ - /* First 4 output values. */ - int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. 
*/ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - - /* First 4 output values. */ - int32x4_t sum0 = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filters) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum; - - sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); - sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - int32x4_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - - sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, - const uint8x16_t samples0_hi, - const uint8x16_t samples1_lo, - const uint8x16_t samples1_hi, - const int8x8_t filters) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); - sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); - sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x3_t permute_tbl) { - uint8x16_t permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - /* First 4 output values. 
*/ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); - sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) - -static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t filters) { - int16x4_t sum = vmul_lane_s16(s0, filters, 0); - sum = vmla_lane_s16(sum, s1, filters, 1); - sum = vmla_lane_s16(sum, s2, filters, 2); - sum = vmla_lane_s16(sum, s3, filters, 3); - return sum; -} - -static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x4_t filters) { - int16x8_t sum = vmulq_lane_s16(s0, filters, 0); - sum = vmlaq_lane_s16(sum, s1, filters, 1); - sum = vmlaq_lane_s16(sum, s2, filters, 2); - sum = vmlaq_lane_s16(sum, s3, filters, 3); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, @@ -428,4 +74,99 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, filters); } +// 2-tap (bilinear) filter values are always positive, but 4-tap filter values +// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much +// greater positive values to compensate. To use instructions that operate on +// 8-bit types we also need the types to be unsigned. Subtracting the products +// of taps 0 and 3 from the products of taps 1 and 2 always works given that +// 2-tap filters are 0-padded. +static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t filter_taps[4]) { + uint16x8_t sum = vmull_u8(s1, filter_taps[1]); + sum = vmlal_u8(sum, s2, filter_taps[2]); + sum = vmlsl_u8(sum, s0, filter_taps[0]); + sum = vmlsl_u8(sum, s3, filter_taps[3]); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1); +} + +static INLINE void convolve_4tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const uint8x8_t y_filter = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1); + + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. 
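// Illustrative sketch, not part of the upstream patch: in scalar terms
// convolve4_8 evaluates, with the (halved) tap magnitudes t0..t3,
//   uint16_t sum = t1 * s1 + t2 * s2 - t0 * s0 - t3 * s3;
// entirely in unsigned 16-bit lanes. Any modulo-2^16 wraparound is harmless
// because the true value always fits in int16 range, so the
// vreinterpretq_s16_u16 above recovers the correct signed sum before the
// rounding narrow.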
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride); + + src += 2 * src_stride; + + do { + uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride); + uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride); + uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride); + + uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps); + uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t s0, s1, s2; + load_u8_8x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + uint8x8_t s3, s4, s5, s6; + load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps); + uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps); + uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps); + uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index 00bac3b9cf..b05a49d3fe 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -20,270 +20,139 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" +// Filter values always sum to 128. +#define FILTER_SUM 128 + DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - - if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); - d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); - store_u8_8x3(d, dst_stride, d0, d1, d2); + // Accumulate into 128 * FILTER_SUM to account for range transform. 
(Divide + // by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); - s += 8; - d += 8; - width -= 8; - } while (width != 0); - } + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); } -static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - } +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
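// Illustrative sketch, not part of the upstream patch: the range transform is
// exact because
//   sum_i f[i] * (x[i] - 128) = sum_i f[i] * x[i] - 128 * sum_i f[i]
// and every interpolation kernel here sums to FILTER_SUM (128), so seeding
// the accumulator with 128 * FILTER_SUM (halved along with the taps in the
// 4-tap paths) cancels the bias and restores the unsigned-domain result.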
+ int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide + // by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, - int y_step_q4, int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int32x4_t correction_8tap = - vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); - const uint8x16_t range_limit = vdupq_n_u8(128); - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ - const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); - vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst, - dst_stride, w, h, x_filter_4tap, - correction_4tap, range_limit); +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} - } else { - vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst, - dst_stride, w, h, x_filter_8tap, - correction_8tap, range_limit); - } +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( +static INLINE void convolve_4tap_horiz_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); - t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. 
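// Worked arithmetic for the halving (assumes every tap is even, as noted
// above): with halved taps the dot product is exactly half, so the rounding
// shift drops one bit without losing precision. With FILTER_BITS == 7:
//   (sum + 64) >> 7 == (sum / 2 + 32) >> 6   // exact whenever sum is even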
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -293,23 +162,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( h -= 4; } while (h != 0); } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -324,26 +191,22 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( } } -static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( +static INLINE void convolve_8tap_horiz_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -353,23 +216,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = 
convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -389,11 +250,6 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int32x4_t correction_8tap = - vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); - const uint8x16_t range_limit = vdupq_n_u8(128); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); @@ -403,21 +259,21 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ + // Load 4-tap filter into first 4 elements of the vector. + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); - vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, - w, h, x_filter_4tap, correction_4tap, - range_limit); + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); } else { - vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, - w, h, x_filter_8tap, correction_8tap, - range_limit); + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); } } @@ -428,10 +284,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -444,22 +296,21 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, src -= 3; if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, 
perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -472,24 +323,23 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl); + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -511,260 +361,142 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, } static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); } static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE void vpx_convolve_4tap_vert_neon_dotprod( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x8_t range_limit) { - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The 
sample range transform and permutation are performed by the caller. - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - do { - uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - d0 = convolve4_4_sdot_partial(s0123, correction, filter); - d1 = convolve4_4_sdot_partial(s1234, correction, filter); - d2 = convolve4_4_sdot_partial(s2345, correction, filter); - d3 = convolve4_4_sdot_partial(s3456, correction, filter); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter); - d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter); - d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter); - d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - s3456_lo = s78910_lo; - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - s3456_hi = s78910_hi; + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. 
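// A scalar expansion of what the two dot products above compute for each of
// the four lanes (layout assumption: samples0_lo / samples0_hi each hold a
// transposed 4x4 block, as produced by transpose_concat_8x4):
//   out[i] = 128 * FILTER_SUM
//          + f[0]*s[i+0] + f[1]*s[i+1] + f[2]*s[i+2] + f[3]*s[i+3]  // *_lo
//          + f[4]*s[i+4] + f[5]*s[i+5] + f[6]*s[i+6] + f[7]*s[i+7]  // *_hi
// where s[] is the source column under output row i.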
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void vpx_convolve_8tap_vert_neon_dotprod( +static INLINE void convolve_8tap_vert_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x8_t range_limit) { + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + int8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456, s78910 } }; + int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -781,83 +513,70 @@ static INLINE void vpx_convolve_8tap_vert_neon_dotprod( h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
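// Note: the hoisted range_limit argument of the old helpers is gone; the
// constant is re-materialised as vdup_n_u8(128) at each use, on the
// assumption that the compiler hoists it out of the loops, so this reads as
// a simplification rather than a behavioural change.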
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + int8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filter); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filter); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filter); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filter); + int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -883,11 +602,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int32x4_t correction_8tap = - vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS))); - const uint8x8_t range_limit = vdup_n_u8(128); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); @@ -897,20 +611,15 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. 
- */ - const int8x8_t y_filter_4tap = - vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); - const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); - vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst, - dst_stride, w, h, y_filter_4tap, - correction_4tap, range_limit); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, - dst_stride, w, h, y_filter_8tap, - correction_8tap, range_limit); + const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter); } } @@ -921,13 +630,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -940,59 +643,54 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, src -= 3 * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
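// Illustration with hypothetical pixel values: for four input rows
//   s0: a0 a1 a2 a3 ...   s1: b0 b1 b2 b3 ...
//   s2: c0 c1 c2 c3 ...   s3: d0 d1 d2 d3 ...
// transpose_concat_4x4 produces
//   s0123: a0 b0 c0 d0 | a1 b1 c1 d1 | a2 b2 c2 d2 | a3 b3 c3 d3
// so each 4-byte group holds one output column across four consecutive rows,
// which is exactly the grouping vdotq_lane_s32 consumes.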
+ int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + int8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456, s78910 } }; + int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filters); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filters); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filters); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -1000,8 +698,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
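// Note: only four new source rows are loaded per iteration. The assignments
// below slide the transposed blocks up four rows, and merge_block_tbl (via
// vqtbl2q_s8) shifts each 4x4 block left by one, two or three columns to
// rebuild s4567 / s5678 / s6789 without redoing the transpose.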
s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -1012,79 +710,67 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + int8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - + int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters); + + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -1094,8 +780,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -1115,3 +801,275 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, } while (w != 0); } } + +static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t x_filter, + const uint8x8_t y_filter) { + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. 
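// Note: unlike the 8-tap 2D path further down, this 4-tap path fuses the two
// passes: the dot-product horizontal results stay in registers (v_s01, v_s12,
// ...) and feed a conventional multiply-accumulate vertical pass (convolve4_8
// with the duplicated y_filter_taps above), so no intermediate buffer is
// needed.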
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl); + int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl); + int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl); + int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1); + uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1); + uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4); + uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4); + + uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps); + uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + v_s01 = v_s45; + v_s12 = v_s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2); + + uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl); + uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl); + uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl); + uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl); + uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl); + uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl); + + uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps); + uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps); + uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps); + uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_8tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). 
See vpx_convolve8_neon_dotprod() + // below for further details on possible values of block height. + uint8x16_t s0, s1, s2; + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = + vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod() + // below for further details on possible values of block height. + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2; + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + const uint8x8_t y_filter_4tap = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1); + + convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter_4tap, + y_filter_4tap); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8, we can reduce the + // maximum buffer size to 64 * (64 + 7). 
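// Arithmetic check: the vertical pass reads im_height = h + SUBPEL_TAPS - 1 =
// h + 7 rows of at most w <= 64 bytes, so the worst case is 64 * 71 = 4544
// bytes, matching the im_block allocation below.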
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS - 1; + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h, + y_filter_8tap); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + + // Averaging convolution always uses an 8-tap filter. + // Account for the vertical phase needing 3 lines prior and 4 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index bcad1dd121..e582004133 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -26,255 +26,112 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_4_usdot(s0, filter, perm_tbl); - d1 = convolve4_4_usdot(s1, filter, perm_tbl); - d2 = convolve4_4_usdot(s2, filter, perm_tbl); - d3 = convolve4_4_usdot(s3, filter, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve4_4_usdot(s0, filter, perm_tbl); - d1 = convolve4_4_usdot(s1, filter, perm_tbl); - d2 = convolve4_4_usdot(s2, filter, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_8_usdot(s0, filter, perm_tbl); - d1 = convolve4_8_usdot(s1, filter, perm_tbl); - d2 = convolve4_8_usdot(s2, filter, perm_tbl); - d3 = convolve4_8_usdot(s3, filter, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve4_8_usdot(s0, filter, perm_tbl); - d1 = convolve4_8_usdot(s1, filter, perm_tbl); - d2 = convolve4_8_usdot(s2, filter, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - } + // Further narrowing and packing is performed by the caller. 
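// Note: USDOT takes unsigned samples and signed filter values directly, so
// this i8mm path needs neither the -128 sample transform nor the
// 128 * FILTER_SUM accumulator bias of the SDOT path above; sums can start
// from vdupq_n_s32(0).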
+ return vmovn_s32(sum); } -static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_usdot(s0, filter, perm_tbl); - d1 = convolve8_4_usdot(s1, filter, perm_tbl); - d2 = convolve8_4_usdot(s2, filter, perm_tbl); - d3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_usdot(s0, filter, perm_tbl); - d1 = convolve8_4_usdot(s1, filter, perm_tbl); - d2 = convolve8_4_usdot(s2, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - } +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. 
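// [Editorial note] With FILTER_BITS == 7 and even taps the reduction is
// exact: writing the full-precision accumulator as 2 * sum,
//   (2 * sum + (1 << 6)) >> 7  ==  (sum + (1 << 5)) >> 6
// for every integer sum, so the reduced shift below rounds to the same
// pixel value as the unhalved kernel would.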
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ - const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, - w, h, x_filter_4tap); - - } else { - vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, - w, h, x_filter_8tap); - } +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); } -static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + // Narrow and re-pack. 
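// [Editorial note] The 8-tap taps are not halved, so one bit is dropped
// with a truncating vshrn_n_s32(..., 1) instead. This keeps the 32-bit
// accumulator (which can exceed int16 range for 8-tap kernels) safe to
// narrow, and lets every caller share the 4-tap paths' final
// vqrshrun_n_s16(..., FILTER_BITS - 1); the identity
//   ((sum >> 1) + (1 << 5)) >> 6  ==  (sum + (1 << 6)) >> 7
// holds for all integer sum, so the rounded result is unchanged.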
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} +static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve4_4_usdot(s0, filter, perm_tbl); - t1 = convolve4_4_usdot(s1, filter, perm_tbl); - t2 = convolve4_4_usdot(s2, filter, perm_tbl); - t3 = convolve4_4_usdot(s3, filter, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -284,23 +141,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( h -= 4; } while (h != 0); } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve4_8_usdot(s0, filter, perm_tbl); - d1 = convolve4_8_usdot(s1, filter, perm_tbl); - d2 = convolve4_8_usdot(s2, filter, perm_tbl); - d3 = convolve4_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -315,25 +170,24 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( } } -static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; - +static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filter, perm_tbl); - t1 = convolve8_4_usdot(s1, filter, perm_tbl); - t2 = convolve8_4_usdot(s2, filter, perm_tbl); - t3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = 
convolve8_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -343,23 +197,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -379,8 +231,6 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); @@ -390,18 +240,21 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ + // Load 4-tap filter into first 4 elements of the vector. + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
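// [Editorial note] InterpKernel rows always hold eight coefficients, with
// 4-tap and bilinear kernels stored centred (leading and trailing zeros).
// Loading from filter[x0_q4] + 2 therefore captures every coefficient that
// can be non-zero, and the vshrn_n_s16(..., 1) narrows to int8 and halves
// in a single instruction.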
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, - h, x_filter_4tap); + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); } else { - vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, - h, x_filter_8tap); + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); } } @@ -411,7 +264,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -424,22 +276,21 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -452,24 +303,23 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl); + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -492,216 +342,130 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t 
*src, ptrdiff_t src_stride, static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); } static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE void vpx_convolve_4tap_vert_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - d0 = convolve4_4_usdot_partial(s0123, filter); - d1 = convolve4_4_usdot_partial(s1234, filter); - d2 = convolve4_4_usdot_partial(s2345, filter); - d3 = convolve4_4_usdot_partial(s3456, filter); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter); - d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter); - d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter); - d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - s3456_lo = s78910_lo; - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - s3456_hi = s78910_hi; + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. 
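// [Editorial note] No permute table is needed on the vertical paths: the
// callers pass samples already transposed into column form, where
// samples0_* cover output pixels 0-3 and samples1_* pixels 4-7, and within
// each pair *_lo holds the first four source rows (taps 0-3) and *_hi the
// last four (taps 4-7). The pre-shift below mirrors convolve8_8_h.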
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void vpx_convolve_8tap_vert_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { +static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + uint8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - d3 = convolve8_4_usdot_partial(s3456, s78910, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -712,67 +476,56 @@ static INLINE void vpx_convolve_8tap_vert_neon_i8mm( h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + uint8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
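// [Editorial note] dot_prod_merge_block_tbl (defined at the top of the
// file) treats { s3456, s78910 } as one 32-byte table: indices below 16
// select bytes from the old block and indices 16+ from the newly loaded
// rows, so each vqtbl2q_u8 slides every transposed 4-row column down by
// one, two or three rows. Only s78910 needs fresh loads; the other three
// blocks per iteration cost a single table lookup each.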
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filter); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filter); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filter); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filter); + uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -798,8 +551,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); @@ -809,17 +560,15 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. 
- */ - const int8x8_t y_filter_4tap = - vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); - vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst, - dst_stride, w, h, y_filter_4tap); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, - dst_stride, w, h, y_filter_8tap); + const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter); } } @@ -830,8 +579,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -844,43 +591,40 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + uint8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filters); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filters); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filters); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -888,8 +632,8 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -900,63 +644,53 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
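// [Editorial note] transpose_concat_8x4 now performs this with two
// vzipq_u8 stages and a vzipq_u16 rather than a vqtbl2q_u8 through the
// removed dot_prod_tran_concat_tbl: zipping rows a0..a3 interleaves bytes
// as a0[0], a1[0], a2[0], a3[0], a0[1], ... - the column-major layout the
// dot product expects - without spending a register on a table load.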
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + uint8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - + uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters); + + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -987,3 +721,275 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t x_filter, + const uint8x8_t y_filter) { + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. 
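// [Editorial note] Lanes 2-5 are broadcast because the centred 4-tap kernel
// keeps its non-zero coefficients there. A plausible scalar model of the
// convolve4_8() helper consuming these taps (declared in
// vpx_convolve8_neon.h; shown only as an assumed illustration):
//   sum = s0 * t[0] + s1 * t[1] + s2 * t[2] + s3 * t[3];  // u8 * u8 -> u16
//   out = sat_u8((sum + (1 << 5)) >> (FILTER_BITS - 1));  // halved taps
// i.e. widening unsigned multiply-accumulates followed by the same reduced
// rounding shift as the horizontal 4-tap paths.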
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl); + int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl); + int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl); + int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1); + uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1); + uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4); + uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4); + + uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps); + uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + v_s01 = v_s45; + v_s12 = v_s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2); + + uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl); + uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl); + uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl); + uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl); + uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl); + uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl); + + uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps); + uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps); + uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps); + uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_8tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = 
convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. + uint8x16_t s0, s1, s2; + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = + vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2; + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
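// [Editorial note] The vertical pass multiplies 8-bit pixels by 8-bit taps,
// so y_filter_4tap must be unsigned; vabsq_s16 is a type-safe route there
// because the 4-tap and bilinear kernels reaching this branch carry no
// negative coefficients. Each vshrn_n_*(..., 1) below narrows and halves
// in one instruction.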
+ const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + const uint8x8_t y_filter_4tap = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1); + + convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter_4tap, + y_filter_4tap); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS - 1; + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + im_block, im_stride, w, im_height, + x_filter_8tap); + + convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h, + y_filter_8tap); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + + // Averaging convolution always uses an 8-tap filter. + // Account for the vertical phase needing 3 lines prior and 4 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 57772ea668..de5fa29471 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -19,31 +19,32 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the - * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). - */ - uint8_t temp[64 * 72]; + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7) (+1 row to make it divisible by 4). + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]); + const int im_stride = 64; const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; - /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior - * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */ - const int intermediate_height = h + vert_filter_taps; + // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) + const int im_height = h + vert_filter_taps; const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting border_offset lines back. The Neon implementation will - * ignore the given height and filter a multiple of 4 lines. 
Since this goes - * in to the temp buffer which has lots of extra room and is subsequently - * discarded this is safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp, - w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); + // Filter starting border_offset rows back. The Neon implementation will + // ignore the given height and filter a multiple of 4 lines. Since this goes + // into the temporary buffer which has lots of extra room and is subsequently + // discarded this is safe if somewhat less than ideal. + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, im_height); - /* Step into the temp buffer border_offset lines to get actual frame data. */ - vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + // Step into the temporary buffer border_offset rows to get actual frame data. + vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst, + dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -51,18 +52,21 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - uint8_t temp[64 * 72]; - const int intermediate_height = h + 8; + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS; + const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); + // This implementation has the same issues as above. In addition, we only want + // to average the values after both passes. + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, im_height); + + vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c deleted file mode 100644 index 9d754fde17..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/arm/vpx_convolve8_neon.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the - * maximum buffer size to 64 * (64 + 7). */ - uint8_t temp[64 * 71]; - - const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; - /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior - * and vert_filter_taps / 2 lines post. */ - const int intermediate_height = h + vert_filter_taps - 1; - const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_dotprod( - src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - - vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - h); -} - -void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - uint8_t temp[64 * 71]; - - /* Averaging convolution always uses an 8-tap filter. */ - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, - y_step_q4, w, intermediate_height); - - vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c deleted file mode 100644 index d7cbb09ea6..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/arm/vpx_convolve8_neon.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the - * maximum buffer size to 64 * (64 + 7). */ - uint8_t temp[64 * 71]; - - const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; - /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior - * and vert_filter_taps / 2 lines post. 
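// [Editorial note] Both standalone wrapper files are removed rather than
// updated: their two-pass logic now lives directly in
// vpx_convolve8_neon_dotprod.c and vpx_convolve8_neon_i8mm.c (see
// vpx_convolve8_neon_i8mm() above), which use an aligned, fixed-stride
// im_block in place of the width-strided temp buffer deleted here.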
*/ - const int intermediate_height = h + vert_filter_taps - 1; - const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride, - temp, w, filter, x0_q4, x_step_q4, y0_q4, - y_step_q4, w, intermediate_height); - - vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - h); -} - -void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - uint8_t temp[64 * 71]; - - /* Averaging convolution always uses an 8-tap filter. */ - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, - w, intermediate_height); - - vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h new file mode 100644 index 0000000000..bf9f18c7e6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ +#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> + +// Some very useful instructions are exclusive to the SVE2 instruction set. +// However, we can access these instructions from a predominantly Neon context +// by making use of the Neon-SVE bridge intrinsics to reinterpret Neon vectors +// as SVE vectors - with the high part of the SVE vector (if it's longer than +// 128 bits) being "don't care". + +static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1, + uint16x8_t tbl) { + svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), + svset_neonq_s16(svundef_s16(), s1)); + return svget_neonq_s16( + svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); +} + +#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h new file mode 100644 index 0000000000..48534fb70e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ +#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> + +// Dot product instructions operating on 16-bit input elements are exclusive to +// the SVE instruction set. However, we can access these instructions from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. + +static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x, + uint16x8_t y) { + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + +static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { + return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), + svset_neonq_s16(svundef_s16(), x), + svset_neonq_s16(svundef_s16(), y))); +} + +#define vpx_dotq_lane_s16(acc, x, y, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \ + svset_neonq_s16(svundef_s16(), x), \ + svset_neonq_s16(svundef_s16(), y), lane)) + +static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) { + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data), + svset_neonq_u16(svundef_u16(), indices))); +} + +#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index b8e3c5e540..9bd5ec285c 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -20,263 +20,271 @@ #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" -static INLINE void scaledconvolve_horiz_w4( +static INLINE void scaledconvolve_horiz_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; + const ptrdiff_t dst_stride, const InterpKernel *const x_filter, + const int x0_q4, const int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); src -= SUBPEL_TAPS / 2 - 1; - y = h; - do { - int x_q4 = x0_q4; - x = 0; + if (w == 4) { do { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + int x_q4 = x0_q4; + + // Process a 4x4 tile. 
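+        // Each iteration filters the four rows of one output column (one
+        // x_q4 step): the source is transposed so the 8-tap filter runs
+        // along vector lanes, and the 4x4 tile is transposed back when
+        // stored below.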
+ for (int r = 0; r < 4; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x8_t ss[4]; - int16x4_t t[8], tt; - - load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); - transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); - - ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); - ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); - ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); - ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); - t[0] = vget_low_s16(ss[0]); - t[1] = vget_low_s16(ss[1]); - t[2] = vget_low_s16(ss[2]); - t[3] = vget_low_s16(ss[3]); - t[4] = vget_high_s16(ss[0]); - t[5] = vget_high_s16(ss[1]); - t[6] = vget_high_s16(ss[2]); - t[7] = vget_high_s16(ss[3]); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS); + + store_u8_4x1(&temp[4 * r], d0); } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 4; ++c) { + temp[r * 4 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } - // transpose the 4x4 filters values back to dst - { - const uint8x8x4_t d4 = vld4_u8(temp); - vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], - vreinterpret_u32_u8(d4.val[0]), 0); - vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], - vreinterpret_u32_u8(d4.val[1]), 0); - vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], - vreinterpret_u32_u8(d4.val[2]), 0); - vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], - vreinterpret_u32_u8(d4.val[3]), 0); - } - x += 4; - } while (x < w); + // Transpose the 4x4 result tile and store. + uint8x8_t d01 = vld1_u8(temp + 0); + uint8x8_t d23 = vld1_u8(temp + 8); - src += src_stride * 4; - dst += dst_stride * 4; - y -= 4; - } while (y > 0); -} + transpose_u8_4x4(&d01, &d23); -static INLINE void scaledconvolve_horiz_w8( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; + store_u8_4x1(dst + 0 * dst_stride, d01); + store_u8_4x1(dst + 1 * dst_stride, d23); + store_u8_4x1_high(dst + 2 * dst_stride, d01); + store_u8_4x1_high(dst + 3 * dst_stride, d23); - // This function processes 8x8 areas. 
The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. - y = (h + 7) & ~7; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + return; + } do { int x_q4 = x0_q4; - x = 0; + uint8_t *d = dst; + int width = w; + do { - uint8x8_t d[8]; - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8]; - load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], - &s[5], &s[6], &s[7]); - transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], - &s[7]); - d[0] = scale_filter_8(s, filters); - vst1_u8(&temp[8 * z], d[0]); + const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(&temp[r * 8], d0); } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 8; ++c) { + temp[r * 8 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } - // transpose the 8x8 filters values back to dst - load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], - &d[7]); - transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); - vst1_u8(&dst[x + 0 * dst_stride], d[0]); - vst1_u8(&dst[x + 1 * dst_stride], d[1]); - vst1_u8(&dst[x + 2 * dst_stride], d[2]); - vst1_u8(&dst[x + 3 * dst_stride], d[3]); - vst1_u8(&dst[x + 4 * dst_stride], d[4]); - vst1_u8(&dst[x + 5 * dst_stride], d[5]); - vst1_u8(&dst[x + 6 * dst_stride], d[6]); - vst1_u8(&dst[x + 7 * dst_stride], d[7]); - x += 8; - } while (x < w); - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} + // Transpose the 8x8 result tile and store. 
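+      // As in the 4x4 path, rows of the temp tile hold output columns, so
+      // transpose back to row order before storing to dst.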
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); -static INLINE void scaledconvolve_vert_w4( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; - int y_q4 = y0_q4; + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x4_t t[8], tt; - - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); - t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); - t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); - t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); - t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); - t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); - t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); - t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } + d += 8; + width -= 8; + } while (width != 0); - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); } -static INLINE void scaledconvolve_vert_w8( +static INLINE void scaledconvolve_vert_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; + const ptrdiff_t dst_stride, const InterpKernel *const y_filter, + const int y0_q4, const int y_step_q4, int w, int h) { int y_q4 = y0_q4; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - d = scale_filter_8(s, filters); - vst1_u8(dst, d); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); -} + if (w == 4) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; -static INLINE void scaledconvolve_vert_w16( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int x, y; - int y_q4 = y0_q4; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS); + + store_u8_4x1(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + if (w == 8) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + if (y_q4 & SUBPEL_MASK) { - x = 0; do { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x16_t ss[8]; - uint8x8_t s[8], d[2]; - load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], - &ss[5], &ss[6], &ss[7]); - s[0] = vget_low_u8(ss[0]); - s[1] = vget_low_u8(ss[1]); - s[2] = vget_low_u8(ss[2]); - s[3] = vget_low_u8(ss[3]); - s[4] = vget_low_u8(ss[4]); - s[5] = vget_low_u8(ss[5]); - s[6] = vget_low_u8(ss[6]); - s[7] = vget_low_u8(ss[7]); - d[0] = scale_filter_8(s, filters); - - s[0] = vget_high_u8(ss[0]); - s[1] = vget_high_u8(ss[1]); - s[2] = vget_high_u8(ss[2]); - s[3] = vget_high_u8(ss[3]); - s[4] = vget_high_u8(ss[4]); - s[5] = vget_high_u8(ss[5]); - s[6] = vget_high_u8(ss[6]); - s[7] = vget_high_u8(ss[7]); - d[1] = scale_filter_8(s, filters); - vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); - src_y += 16; - x += 16; - } while (x < w); + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4))); + s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5))); + s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6))); + s7[0] = 
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7))); + + s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4))); + s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5))); + s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6))); + s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7))); + + uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter); + uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], filter); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); } else { - memcpy(dst, &src_y[3 * src_stride], w); + // Memcpy for non-subpel locations. + s += (SUBPEL_TAPS / 2 - 1) * src_stride; + + do { + uint8x16_t s0 = vld1q_u8(s); + vst1q_u8(d, s0); + s += 16; + d += 16; + width -= 16; + } while (width != 0); } - dst += dst_stride; + y_q4 += y_step_q4; - } while (--y); + dst += dst_stride; + } while (--h != 0); } void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // Fixed size intermediate buffer, im_block, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): + // Deriving the maximum number of rows in the im_block buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the @@ -288,33 +296,20 @@ void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. 
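+  // Worked example: h = 64, y_step_q4 = 32 and y0_q4 <= 15 give
+  // ((64 - 1) * 32 + 15) >> 4 + SUBPEL_TAPS = 126 + 8 = 134 rows, within the
+  // 135-row bound derived above.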
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = + DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); + const int im_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + const ptrdiff_t im_stride = 64; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } + scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, im_block, im_stride, filter, x0_q4, + x_step_q4, w, im_height); - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } + scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); } diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk index 2bee91f449..916dc62cef 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk @@ -112,7 +112,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c +DSP_SRCS-$(HAVE_SVE) += arm/highbd_vpx_convolve8_sve.c +DSP_SRCS-$(HAVE_SVE2) += arm/highbd_vpx_convolve8_sve2.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm @@ -139,9 +140,7 @@ DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c -DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c -DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -374,6 +373,7 @@ DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c +DSP_SRCS-$(HAVE_SVE) += arm/sum_squares_sve.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c @@ -454,6 +454,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c +DSP_SRCS-$(HAVE_SVE) += arm/highbd_variance_sve.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c index 030c456d39..2b8c656afb 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c @@ -12,4 +12,4 @@ 
#include "./vpx_dsp_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_dsp_rtcd() { once(setup_rtcd_internal); } +void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 18087e25d9..f40f85c036 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -427,19 +427,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH if (vpx_config("CONFIG_VP9") eq "yes") { @@ -1009,7 +1009,7 @@ add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad_skip_4x4x4d neon/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; -specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; +specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/; # # Structured Similarity (SSIM) @@ -1411,163 +1411,163 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x64 sse2 neon/; + specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x32 sse2 neon/; + specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/; 
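(Aside: each add_proto/specialize pair in this file is consumed by the rtcd generator, which emits one function pointer per prototype and selects the best available implementation once at startup. A rough, hypothetical C sketch of the ARM-side dispatch derived from the entry above -- the names and structure are illustrative; the real generated vpx_dsp_rtcd.h differs in detail:

// Hypothetical sketch of the dispatch the rtcd generator derives from
// "specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/".
unsigned int (*vpx_highbd_12_variance64x32)(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride,
                                            unsigned int *sse);

static void setup_rtcd_internal(void) {
  int flags = arm_cpu_caps();
  // Default to the portable C implementation, then upgrade to the best
  // variant the CPU supports.
  vpx_highbd_12_variance64x32 = vpx_highbd_12_variance64x32_c;
#if HAVE_NEON
  if (flags & HAS_NEON)
    vpx_highbd_12_variance64x32 = vpx_highbd_12_variance64x32_neon;
#endif
#if HAVE_SVE
  if (flags & HAS_SVE)
    vpx_highbd_12_variance64x32 = vpx_highbd_12_variance64x32_sve;
#endif
}

vpx_dsp_rtcd(), shown at the start of this hunk, runs setup_rtcd_internal() exactly once via once().)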
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x64 sse2 neon/; + specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x32 sse2 neon/; + specialize qw/vpx_highbd_12_variance32x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x16 sse2 neon/; + specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x32 sse2 neon/; + specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x16 sse2 neon/; + specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x8 sse2 neon/; + specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x16 sse2 neon/; + specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x8 sse2 neon/; + specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x4 neon/; + specialize qw/vpx_highbd_12_variance8x4 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance4x8 neon/; + specialize qw/vpx_highbd_12_variance4x8 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance4x4 neon/; + specialize qw/vpx_highbd_12_variance4x4 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x64 sse2 neon/; + specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x32 sse2 neon/; + specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x64 sse2 neon/; + specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x32 sse2 neon/; + specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x16 sse2 neon/; + specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x32 sse2 neon/; + specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x16 sse2 neon/; + specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x8 sse2 neon/; + specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x16 sse2 neon/; + specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x8 sse2 neon/; + specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x4 neon/; + specialize qw/vpx_highbd_10_variance8x4 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance4x8 neon/; + specialize qw/vpx_highbd_10_variance4x8 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance4x4 neon/; + specialize qw/vpx_highbd_10_variance4x4 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x64 sse2 neon/; + specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x32 sse2 neon/; + specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x64 sse2 neon/; + specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x32 sse2 neon/; + specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x16 sse2 neon/; + specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x32 sse2 neon/; + specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x16 sse2 neon/; + specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x8 sse2 neon/; + specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x16 sse2 neon/; + specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x8 sse2 neon/; + specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x4 neon/; + specialize qw/vpx_highbd_8_variance8x4 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance4x8 neon/; + specialize qw/vpx_highbd_8_variance4x8 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance4x4 neon/; + specialize qw/vpx_highbd_8_variance4x4 neon sve/; add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get16x16var sse2 neon/; + specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/; add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get8x8var sse2 neon/; + specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/; add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get16x16var sse2 neon/; + specialize 
qw/vpx_highbd_10_get16x16var sse2 neon sve/; add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get8x8var sse2 neon/; + specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/; add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get16x16var sse2 neon/; + specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/; add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get8x8var sse2 neon/; + specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x8 neon/; + specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x16 neon/; + specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x8 neon/; + specialize qw/vpx_highbd_10_mse16x8 neon sve/; add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x16 neon/; + specialize qw/vpx_highbd_10_mse8x16 neon sve/; add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x8 sse2 neon/; + specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x8 neon/; + specialize qw/vpx_highbd_12_mse16x8 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vpx_highbd_12_mse8x16 neon/; + specialize qw/vpx_highbd_12_mse8x16 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; + specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; specialize qw/vpx_highbd_comp_avg_pred neon sse2/; diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h index 0cddcb6991..eb8ff06cd7 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h +++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h @@ -28,7 +28,6 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; static INLINE int vpx_get_filter_taps(const int16_t *const filter) { - assert(filter[3] != 128); if (filter[0] | filter[7]) { return 8; } diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c index 539d09bb39..eba12d312a 100644 --- a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c +++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c @@ -15,7 +15,7 @@ #include <sys/sysctl.h> #endif -#if !CONFIG_RUNTIME_CPU_DETECT +#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) static int arm_get_cpu_caps(void) { // This function should actually be a no-op. There is no way to adjust any of @@ -28,7 +28,7 @@ static int arm_get_cpu_caps(void) { return flags; } -#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) // sysctlbyname() parameter documentation for instruction set characteristics: // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics @@ -99,14 +99,17 @@ static int arm_get_cpu_caps(void) { // hwcap values are not defined should not prevent features from being enabled. #define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) #define VPX_AARCH64_HWCAP_SVE (1 << 22) +#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1) #define VPX_AARCH64_HWCAP2_I8MM (1 << 13) static int arm_get_cpu_caps(void) { int flags = 0; +#if HAVE_NEON_DOTPROD || HAVE_SVE unsigned long hwcap = getauxval(AT_HWCAP); -#if HAVE_NEON_I8MM +#endif // HAVE_NEON_DOTPROD || HAVE_SVE +#if HAVE_NEON_I8MM || HAVE_SVE2 unsigned long hwcap2 = getauxval(AT_HWCAP2); -#endif // HAVE_NEON_I8MM +#endif // HAVE_NEON_I8MM || HAVE_SVE2 #if HAVE_NEON flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. #endif // HAVE_NEON @@ -125,6 +128,11 @@ static int arm_get_cpu_caps(void) { flags |= HAS_SVE; } #endif // HAVE_SVE +#if HAVE_SVE2 + if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) { + flags |= HAS_SVE2; + } +#endif // HAVE_SVE2 return flags; } @@ -195,5 +203,10 @@ int arm_cpu_caps(void) { flags &= ~HAS_SVE; } + // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available. + if (!(flags & HAS_SVE)) { + flags &= ~HAS_SVE2; + } + return flags; } diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h index 39365d18ee..814c3cc408 100644 --- a/media/libvpx/libvpx/vpx_ports/arm.h +++ b/media/libvpx/libvpx/vpx_ports/arm.h @@ -25,6 +25,8 @@ extern "C" { #define HAS_NEON_I8MM (1 << 2) // Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. #define HAS_SVE (1 << 3) +// Armv9.0-A SVE2 instructions. 
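+// Runtime detection also requires FEAT_SVE: arm_cpu_caps() clears HAS_SVE2
+// whenever HAS_SVE is absent (see aarch64_cpudetect.c above).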
+#define HAS_SVE2 (1 << 4) int arm_cpu_caps(void); diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c index f1036b98ed..79b98a75f1 100644 --- a/media/libvpx/libvpx/vpx_ports/emms_mmx.c +++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c @@ -12,4 +12,4 @@ #include "vpx_ports/system_state.h" -void vpx_clear_system_state() { _mm_empty(); } +void vpx_clear_system_state(void) { _mm_empty(); } diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h index 5eccfe8f50..ee9e095633 100644 --- a/media/libvpx/libvpx/vpx_ports/mem.h +++ b/media/libvpx/libvpx/vpx_ports/mem.h @@ -23,7 +23,13 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -#if HAVE_NEON && defined(_MSC_VER) +#if defined(__has_builtin) +#define VPX_HAS_BUILTIN(x) __has_builtin(x) +#else +#define VPX_HAS_BUILTIN(x) 0 +#endif + +#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__) #define __builtin_prefetch(x) #endif diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h index d8a8ed89fe..d33eff4397 100644 --- a/media/libvpx/libvpx/vpx_ports/vpx_once.h +++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h @@ -91,29 +91,6 @@ static void once(void (*func)(void)) { return; } -#elif CONFIG_MULTITHREAD && defined(__OS2__) -#define INCL_DOS -#include <os2.h> -static void once(void (*func)(void)) { - static volatile int done; - - /* If the initialization is complete, return early. */ - if (done) return; - - /* Causes all other threads in the process to block themselves - * and give up their time slice. - */ - DosEnterCritSec(); - - if (!done) { - func(); - done = 1; - } - - /* Restores normal thread dispatching for the current process. */ - DosExitCritSec(); -} - #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> static void once(void (*func)(void)) { diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c index dc4d9593a8..706b0770c8 100644 --- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c @@ -12,4 +12,4 @@ #include "./vpx_scale_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_scale_rtcd() { once(setup_rtcd_internal); } +void vpx_scale_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h new file mode 100644 index 0000000000..cdd18d0f30 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h @@ -0,0 +1,157 @@ +// Copyright 2024 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// pthread.h wrapper + +#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_ +#define VPX_VPX_UTIL_VPX_PTHREAD_H_ + +#include "./vpx_config.h" + +#if CONFIG_MULTITHREAD + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) && !HAVE_PTHREAD_H +// Prevent leaking max/min macros. 
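+// (windows.h otherwise defines min()/max() macros that clash with user code.)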
+#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include <errno.h> // NOLINT +#include <process.h> // NOLINT +#include <stddef.h> // NOLINT +#include <windows.h> // NOLINT +typedef HANDLE pthread_t; +typedef CRITICAL_SECTION pthread_mutex_t; + +#if _WIN32_WINNT < 0x0600 +#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. +#endif +typedef CONDITION_VARIABLE pthread_cond_t; + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREADFN unsigned int __stdcall +#endif +#define THREAD_EXIT_SUCCESS 0 + +static INLINE int pthread_create(pthread_t *const thread, const void *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwStackSize */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != + WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); + return 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + (void)condition; + return 0; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; + InitializeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + WakeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + WakeAllConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok; + ok = SleepConditionVariableCS(condition, mutex, INFINITE); + return !ok; +} +#else // _WIN32 +#include <pthread.h> // NOLINT +#define THREADFN void * +#define THREAD_EXIT_SUCCESS NULL +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CONFIG_MULTITHREAD + +#endif // VPX_VPX_UTIL_VPX_PTHREAD_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c index 04c5fb6f26..0d0e2f5766 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_thread.c +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c @@ -12,10 +12,18 @@ // Original source: // https://chromium.googlesource.com/webm/libwebp +// Enable GNU extensions in glibc so that we can call pthread_setname_np(). +// This must be before any #include statements. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include <assert.h> #include <string.h> // for memset() +#include "./vpx_config.h" #include "./vpx_thread.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #if CONFIG_MULTITHREAD @@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker); // Forward declaration. static THREADFN thread_loop(void *ptr) { VPxWorker *const worker = (VPxWorker *)ptr; - int done = 0; - while (!done) { - pthread_mutex_lock(&worker->impl_->mutex_); - while (worker->status_ == OK) { // wait in idling mode +#ifdef __APPLE__ + if (worker->thread_name != NULL) { + // Apple's version of pthread_setname_np takes one argument and operates on + // the current thread only. The maximum size of the thread_name buffer was + // noted in the Chromium source code and was confirmed by experiments. If + // thread_name is too long, pthread_setname_np returns -1 with errno + // ENAMETOOLONG (63). + char thread_name[64]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(thread_name); + } +#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__) + if (worker->thread_name != NULL) { + // Linux and Android require names (with nul) fit in 16 chars, otherwise + // pthread_setname_np() returns ERANGE (34). 
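+    // As in the Apple branch above, truncate rather than let the call fail.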
+ char thread_name[16]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(pthread_self(), thread_name); + } +#endif + pthread_mutex_lock(&worker->impl_->mutex_); + for (;;) { + while (worker->status_ == VPX_WORKER_STATUS_OK) { // wait in idling mode pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } - if (worker->status_ == WORK) { + if (worker->status_ == VPX_WORKER_STATUS_WORKING) { + // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread + // doesn't change worker->status_ and will wait until the worker changes + // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the + // worker can safely call execute() without holding worker->impl_->mutex_. + // When the worker reacquires worker->impl_->mutex_, worker->status_ must + // still be VPX_WORKER_STATUS_WORKING. + pthread_mutex_unlock(&worker->impl_->mutex_); execute(worker); - worker->status_ = OK; - } else if (worker->status_ == NOT_OK) { // finish the worker - done = 1; + pthread_mutex_lock(&worker->impl_->mutex_); + assert(worker->status_ == VPX_WORKER_STATUS_WORKING); + worker->status_ = VPX_WORKER_STATUS_OK; + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + } else { + assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); // finish the worker + break; } - // signal to the main thread that we're done (for sync()) - pthread_cond_signal(&worker->impl_->condition_); - pthread_mutex_unlock(&worker->impl_->mutex_); } - return THREAD_RETURN(NULL); // Thread is finished + pthread_mutex_unlock(&worker->impl_->mutex_); + return THREAD_EXIT_SUCCESS; // Thread is finished } // main thread state control @@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { if (worker->impl_ == NULL) return; pthread_mutex_lock(&worker->impl_->mutex_); - if (worker->status_ >= OK) { + if (worker->status_ >= VPX_WORKER_STATUS_OK) { // wait for the worker to finish - while (worker->status_ != OK) { + while (worker->status_ != VPX_WORKER_STATUS_OK) { pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } // assign new status and release the working thread if needed - if (new_status != OK) { + if (new_status != VPX_WORKER_STATUS_OK) { worker->status_ = new_status; pthread_cond_signal(&worker->impl_->condition_); } @@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { static void init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; + worker->status_ = VPX_WORKER_STATUS_NOT_OK; } static int sync(VPxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, OK); + change_state(worker, VPX_WORKER_STATUS_OK); #endif - assert(worker->status_ <= OK); + assert(worker->status_ <= VPX_WORKER_STATUS_OK); return !worker->had_error; } static int reset(VPxWorker *const worker) { int ok = 1; worker->had_error = 0; - if (worker->status_ < OK) { + if (worker->status_ < VPX_WORKER_STATUS_OK) { #if CONFIG_MULTITHREAD worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ == NULL) { @@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) { } pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); - if (ok) worker->status_ = OK; + if (ok) worker->status_ = VPX_WORKER_STATUS_OK; pthread_mutex_unlock(&worker->impl_->mutex_); if 
@@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
   if (worker->impl_ == NULL) return;

   pthread_mutex_lock(&worker->impl_->mutex_);
-  if (worker->status_ >= OK) {
+  if (worker->status_ >= VPX_WORKER_STATUS_OK) {
     // wait for the worker to finish
-    while (worker->status_ != OK) {
+    while (worker->status_ != VPX_WORKER_STATUS_OK) {
       pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
     // assign new status and release the working thread if needed
-    if (new_status != OK) {
+    if (new_status != VPX_WORKER_STATUS_OK) {
       worker->status_ = new_status;
       pthread_cond_signal(&worker->impl_->condition_);
     }
@@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {

 static void init(VPxWorker *const worker) {
   memset(worker, 0, sizeof(*worker));
-  worker->status_ = NOT_OK;
+  worker->status_ = VPX_WORKER_STATUS_NOT_OK;
 }

 static int sync(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  change_state(worker, OK);
+  change_state(worker, VPX_WORKER_STATUS_OK);
 #endif
-  assert(worker->status_ <= OK);
+  assert(worker->status_ <= VPX_WORKER_STATUS_OK);
   return !worker->had_error;
 }

 static int reset(VPxWorker *const worker) {
   int ok = 1;
   worker->had_error = 0;
-  if (worker->status_ < OK) {
+  if (worker->status_ < VPX_WORKER_STATUS_OK) {
 #if CONFIG_MULTITHREAD
     worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
     if (worker->impl_ == NULL) {
@@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) {
     }
     pthread_mutex_lock(&worker->impl_->mutex_);
     ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
-    if (ok) worker->status_ = OK;
+    if (ok) worker->status_ = VPX_WORKER_STATUS_OK;
     pthread_mutex_unlock(&worker->impl_->mutex_);
     if (!ok) {
       pthread_mutex_destroy(&worker->impl_->mutex_);
@@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) {
       return 0;
     }
 #else
-    worker->status_ = OK;
+    worker->status_ = VPX_WORKER_STATUS_OK;
 #endif
-  } else if (worker->status_ > OK) {
+  } else if (worker->status_ > VPX_WORKER_STATUS_OK) {
     ok = sync(worker);
   }
-  assert(!ok || (worker->status_ == OK));
+  assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK));
   return ok;
 }

@@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) {

 static void launch(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  change_state(worker, WORK);
+  change_state(worker, VPX_WORKER_STATUS_WORKING);
 #else
   execute(worker);
 #endif
@@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) {
 static void end(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
   if (worker->impl_ != NULL) {
-    change_state(worker, NOT_OK);
+    change_state(worker, VPX_WORKER_STATUS_NOT_OK);
     pthread_join(worker->impl_->thread_, NULL);
     pthread_mutex_destroy(&worker->impl_->mutex_);
     pthread_cond_destroy(&worker->impl_->condition_);
@@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) {
     worker->impl_ = NULL;
   }
 #else
-  worker->status_ = NOT_OK;
+  worker->status_ = VPX_WORKER_STATUS_NOT_OK;
   assert(worker->impl_ == NULL);
 #endif
-  assert(worker->status_ == NOT_OK);
+  assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);
 }

 //------------------------------------------------------------------------------
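Aside: the comments added to thread_loop() above describe a strict ownership handshake: only the main thread moves status_ to WORKING (via change_state()), and only the worker moves it back to OK, so execute() can run with the mutex dropped. A self-contained miniature of that protocol using plain pthreads, as a sketch; MiniWorker and the mini_* names are illustrative, not libvpx code.

#include <pthread.h>

typedef enum { MINI_NOT_OK = 0, MINI_OK, MINI_WORKING } MiniStatus;

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  MiniStatus status;
  int (*hook)(void *);
  void *data;
  int had_error;
} MiniWorker;

// Worker side: mirrors thread_loop(). The hook runs with the mutex dropped;
// the main thread never touches status while it is MINI_WORKING.
static void *mini_loop(void *arg) {
  MiniWorker *w = (MiniWorker *)arg;
  pthread_mutex_lock(&w->mutex);
  for (;;) {
    while (w->status == MINI_OK) pthread_cond_wait(&w->cond, &w->mutex);
    if (w->status != MINI_WORKING) break;  // MINI_NOT_OK: shut down
    pthread_mutex_unlock(&w->mutex);
    if (w->hook != NULL && !w->hook(w->data)) w->had_error = 1;
    pthread_mutex_lock(&w->mutex);
    w->status = MINI_OK;
    pthread_cond_signal(&w->cond);  // wake a waiting mini_sync() caller
  }
  pthread_mutex_unlock(&w->mutex);
  return NULL;
}

// Main-thread side: mirrors change_state(). Wait for the current job to
// retire, then publish the new status under the mutex.
static void mini_change(MiniWorker *w, MiniStatus s) {
  pthread_mutex_lock(&w->mutex);
  if (w->status >= MINI_OK) {
    while (w->status == MINI_WORKING) pthread_cond_wait(&w->cond, &w->mutex);
    if (s != MINI_OK) {
      w->status = s;
      pthread_cond_signal(&w->cond);
    }
  }
  pthread_mutex_unlock(&w->mutex);
}

static void mini_launch(MiniWorker *w) { mini_change(w, MINI_WORKING); }
static int mini_sync(MiniWorker *w) {
  mini_change(w, MINI_OK);  // returns once the worker is idle again
  return !w->had_error;
}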
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
index 6d308e949b..11a1d74387 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.h
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -15,370 +15,22 @@
 #ifndef VPX_VPX_UTIL_VPX_THREAD_H_
 #define VPX_VPX_UTIL_VPX_THREAD_H_

-#include "./vpx_config.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif

-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
-
-#if CONFIG_MULTITHREAD
-
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-#include <errno.h>    // NOLINT
-#include <process.h>  // NOLINT
-#include <windows.h>  // NOLINT
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
-typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
-  HANDLE waiting_sem_;
-  HANDLE received_sem_;
-  HANDLE signal_event_;
-} pthread_cond_t;
-#endif  // _WIN32_WINNT >= 0x600
-
-#ifndef WINAPI_FAMILY_PARTITION
-#define WINAPI_PARTITION_DESKTOP 1
-#define WINAPI_FAMILY_PARTITION(x) x
-#endif
-
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define USE_CREATE_THREAD
-#endif
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-// _beginthreadex requires __stdcall
-#if defined(__GNUC__) && \
-    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREADFN unsigned int __stdcall
-#endif
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-#if _WIN32_WINNT >= 0x0501  // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
-  WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
-                                 unsigned int(__stdcall *start)(void *),
-                                 void *arg) {
-  (void)attr;
-#ifdef USE_CREATE_THREAD
-  *thread = CreateThread(NULL,          /* lpThreadAttributes */
-                         0,             /* dwStackSize */
-                         start, arg, 0, /* dwStackSize */
-                         NULL);         /* lpThreadId */
-#else
-  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
-                                      0,             /* unsigned stack_size */
-                                      start, arg, 0, /* unsigned initflag */
-                                      NULL);         /* unsigned *thrdaddr */
-#endif
-  if (*thread == NULL) return 1;
-  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
-  return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
-  (void)value_ptr;
-  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
-          CloseHandle(thread) == 0);
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
-                                     void *mutexattr) {
-  (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
-  InitializeCriticalSection(mutex);
-#endif
-  return 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
-  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
-  EnterCriticalSection(mutex);
-  return 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
-  LeaveCriticalSection(mutex);
-  return 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
-  DeleteCriticalSection(mutex);
-  return 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  (void)condition;
-#else
-  ok &= (CloseHandle(condition->waiting_sem_) != 0);
-  ok &= (CloseHandle(condition->received_sem_) != 0);
-  ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
-                                    void *cond_attr) {
-  (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  InitializeConditionVariable(condition);
-#else
-  condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
-  if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
-      condition->signal_event_ == NULL) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-#endif
-  return 0;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  WakeAllConditionVariable(condition);
-#else
-  while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok &= SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  WakeConditionVariable(condition);
-#else
-  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok = SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
-                                    pthread_mutex_t *const mutex) {
-  int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
-  // note that there is a consumer available so the signal isn't dropped in
-  // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
-  // now unlock the mutex so pthread_cond_signal may be issued
-  pthread_mutex_unlock(mutex);
-  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
-        WAIT_OBJECT_0);
-  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
-  pthread_mutex_lock(mutex);
-#endif
-  return !ok;
-}
-
-#elif defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>  // NOLINT
-
-#include <errno.h>        // NOLINT
-#include <stdlib.h>       // NOLINT
-#include <sys/builtin.h>  // NOLINT
-
-#if defined(__STRICT_ANSI__)
-// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here.
-int _beginthread(void (*)(void *), void *, unsigned, void *);
-#endif
-
-#define pthread_t TID
-#define pthread_mutex_t HMTX
-
-typedef struct {
-  HEV event_sem_;
-  HEV ack_sem_;
-  volatile unsigned wait_count_;
-} pthread_cond_t;
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#define THREADFN void *
-#define THREAD_RETURN(val) (val)
-
-typedef struct {
-  void *(*start_)(void *);
-  void *arg_;
-} thread_arg;
-
-static void thread_start(void *arg) {
-  thread_arg targ = *(thread_arg *)arg;
-  free(arg);
-
-  targ.start_(targ.arg_);
-}
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
-                                 void *(*start)(void *), void *arg) {
-  int tid;
-  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
-  if (targ == NULL) return 1;
-
-  (void)attr;
-
-  targ->start_ = start;
-  targ->arg_ = arg;
-  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
-  if (tid == -1) {
-    free(targ);
-    return 1;
-  }
-
-  *thread = tid;
-  return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
-  (void)value_ptr;
-  return DosWaitThread(&thread, DCWW_WAIT) != 0;
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
-                                     void *mutexattr) {
-  (void)mutexattr;
-  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
-  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
-  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
-  return DosReleaseMutexSem(*mutex) != 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
-  return DosCloseMutexSem(*mutex) != 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-  ok &= DosCloseEventSem(condition->event_sem_) == 0;
-  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
-  return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
-                                    void *cond_attr) {
-  int ok = 1;
-  (void)cond_attr;
-
-  ok &=
-      DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
-  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
-  if (!ok) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-  condition->wait_count_ = 0;
-  return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-
-  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
-    ok &= DosPostEventSem(condition->event_sem_) == 0;
-    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
-  }
-
-  return !ok;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
-  int ok = 1;
-
-  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
-    ok &= pthread_cond_signal(condition) == 0;
-
-  return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
-                                    pthread_mutex_t *const mutex) {
-  int ok = 1;
-
-  __atomic_increment(&condition->wait_count_);
-
-  ok &= pthread_mutex_unlock(mutex) == 0;
-
-  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
-
-  __atomic_decrement(&condition->wait_count_);
-
-  ok &= DosPostEventSem(condition->ack_sem_) == 0;
-
-  pthread_mutex_lock(mutex);
-
-  return !ok;
-}
-#else  // _WIN32
-#include <pthread.h>  // NOLINT
-#define THREADFN void *
-#define THREAD_RETURN(val) val
-#endif
-
-#endif  // CONFIG_MULTITHREAD
+#define MAX_NUM_THREADS 64

 // State of the worker thread object
 typedef enum {
-  NOT_OK = 0,  // object is unusable
-  OK,          // ready to work
-  WORK         // busy finishing the current task
+  VPX_WORKER_STATUS_NOT_OK = 0,  // object is unusable
+  VPX_WORKER_STATUS_OK,          // ready to work
+  VPX_WORKER_STATUS_WORKING      // busy finishing the current task
 } VPxWorkerStatus;

 // Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
 typedef int (*VPxWorkerHook)(void *, void *);

 // Platform-dependent implementation details for the worker.
@@ -388,10 +40,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl;

 typedef struct {
   VPxWorkerImpl *impl_;
   VPxWorkerStatus status_;
+  // Thread name for the debugger. If not NULL, must point to a string that
+  // outlives the worker thread. For portability, use a name <= 15 characters
+  // long (not including the terminating NUL character).
+  const char *thread_name;
   VPxWorkerHook hook;  // hook to call
   void *data1;         // first argument passed to 'hook'
   void *data2;         // second argument passed to 'hook'
-  int had_error;       // return value of the last call to 'hook'
+  int had_error;       // true if a call to 'hook' returned false
 } VPxWorker;

 // The interface for all thread-worker related functions. All these functions
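Aside: with the new thread_name field, a caller wires up a worker roughly as below. This sketch assumes the vpx_get_worker_interface() accessor that accompanies this header upstream; my_hook and run_one_job are illustrative names.

#include "vpx_util/vpx_thread.h"

static int my_hook(void *data1, void *data2) {
  (void)data1;
  (void)data2;
  return 1;  // true: success
}

static int run_one_job(void) {
  const VPxWorkerInterface *const itf = vpx_get_worker_interface();
  VPxWorker worker;
  int ok;
  itf->init(&worker);
  // String literal outlives the thread; 8 chars, well under the 15-char
  // portability limit noted in the struct comment.
  worker.thread_name = "vpx-demo";
  worker.hook = my_hook;
  worker.data1 = NULL;
  worker.data2 = NULL;
  if (!itf->reset(&worker)) return 0;  // allocates impl_, spawns the thread
  itf->launch(&worker);     // hook runs asynchronously on the worker
  ok = itf->sync(&worker);  // blocks until idle; false if the hook failed
  itf->end(&worker);        // joins the thread and cleans up
  return ok;
}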
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
index 1162714956..948e6d6f89 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_util.mk
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -10,6 +10,7 @@

 UTIL_SRCS-yes += vpx_atomics.h
 UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_pthread.h
 UTIL_SRCS-yes += vpx_thread.c
 UTIL_SRCS-yes += vpx_thread.h
 UTIL_SRCS-yes += endian_inl.h
diff --git a/media/libvpx/missing_header.patch b/media/libvpx/missing_header.patch
new file mode 100644
index 0000000000..02b77170ee
--- /dev/null
+++ b/media/libvpx/missing_header.patch
@@ -0,0 +1,12 @@
+Add missing header for EBUSY
+
+--- a/vpx_util/vpx_pthread.h
++++ b/vpx_util/vpx_pthread.h
+@@ -26,6 +26,7 @@ extern "C" {
+ #define NOMINMAX
+ #undef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
++#include <errno.h>    // NOLINT
+ #include <process.h>  // NOLINT
+ #include <stddef.h>   // NOLINT
+ #include <windows.h>  // NOLINT
diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
index 582bc6fd5d..635b5d0fdd 100644
--- a/media/libvpx/moz.build
+++ b/media/libvpx/moz.build
@@ -72,7 +72,10 @@ elif CONFIG['TARGET_CPU'] == 'arm':
     ]
 elif CONFIG['TARGET_CPU'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT':
     EXPORTS.vpx += files['ARM64_EXPORTS']
-    SOURCES += files['ARM64_SOURCES']
+    # Bug 1885585: clang on win/aarch64 cannot compile SVInt8_t type for now.
+    SOURCES += [
+        f for f in files['ARM64_SOURCES'] if not f.endswith('_sve.c')
+    ]
     ASFLAGS += [ '-I%s/media/libvpx/config/win/aarch64/' % TOPSRCDIR ]
     LOCAL_INCLUDES += [ '/media/libvpx/config/win/aarch64/' ]
     SOURCES += [ '/media/libvpx/config/win/aarch64/vpx_config.c' ]
@@ -125,6 +128,10 @@ for f in SOURCES:
         SOURCES[f].flags += ['-march=armv8.2-a+dotprod']
     if 'neon_i8mm.c' in f:
         SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm']
+    if 'sve.c' in f:
+        SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm+sve']
+    if 'sve2.c' in f:
+        SOURCES[f].flags += ['-march=armv9-a+sve2']

 # Suppress warnings in third-party code.
 CFLAGS += [
diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml
index 17704a1905..0b3ec52482 100644
--- a/media/libvpx/moz.yaml
+++ b/media/libvpx/moz.yaml
@@ -20,11 +20,11 @@ origin:

   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d (Tue Jan 02 20:08:06 2024).
+  release: 7fb8ceccf92c35cd5131b05c0502916715ebc76b (Fri Mar 15 01:11:50 2024).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d
+  revision: 7fb8ceccf92c35cd5131b05c0502916715ebc76b

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -53,8 +53,10 @@ vendoring:
     - tools/

   patches:
+    - arm_cpu_runtime_detection_code_on_openbsd.patch
     - input_frame_validation.patch
     - input_frame_validation_vp9.patch
+    - missing_header.patch

   update-actions:
     - action: move-file
diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild
index 2960dee255..1ad5d4447c 100644
--- a/media/libvpx/sources.mozbuild
+++ b/media/libvpx/sources.mozbuild
@@ -934,6 +934,7 @@ files = {
     'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_error_sve.c',
     'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c',
     'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
@@ -1006,6 +1007,7 @@ files = {
     'libvpx/vpx_dsp/arm/subpel_variance_neon.c',
     'libvpx/vpx_dsp/arm/subtract_neon.c',
     'libvpx/vpx_dsp/arm/sum_squares_neon.c',
+    'libvpx/vpx_dsp/arm/sum_squares_sve.c',
     'libvpx/vpx_dsp/arm/variance_neon.c',
     'libvpx/vpx_dsp/arm/variance_neon_dotprod.c',
     'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c',
@@ -1014,8 +1016,6 @@ files = {
     'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c',
     'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c',
     'libvpx/vpx_dsp/arm/vpx_convolve_neon.c',
-    'libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c',
-    'libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c',
     'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c',
    'libvpx/vpx_dsp/avg.c',
     'libvpx/vpx_dsp/bitreader.c',
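Aside: the per-file -march flags added in moz.build above exist because SVE intrinsics from <arm_sve.h> only compile when the target architecture advertises SVE, so the SVE kernels are isolated in *_sve.c files built with those flags and dispatched at run time by CPU detection. A minimal sketch of the compile-time guard such a file relies on; the helper name is illustrative.

// Compiled only with -march=...+sve (or +sve2), which defines
// __ARM_FEATURE_SVE; callers must gate calls behind runtime CPU detection.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// Returns the hardware SVE vector length in bytes (scalable: 16..256).
static int sve_register_width_bytes(void) { return (int)svcntb(); }
#endif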