diff options
Diffstat (limited to '')
-rw-r--r-- | audio/filter/af_drop.c | 114 | ||||
-rw-r--r-- | audio/filter/af_format.c | 143 | ||||
-rw-r--r-- | audio/filter/af_lavcac3enc.c | 437 | ||||
-rw-r--r-- | audio/filter/af_rubberband.c | 382 | ||||
-rw-r--r-- | audio/filter/af_scaletempo.c | 626 | ||||
-rw-r--r-- | audio/filter/af_scaletempo2.c | 254 | ||||
-rw-r--r-- | audio/filter/af_scaletempo2_internals.c | 873 | ||||
-rw-r--r-- | audio/filter/af_scaletempo2_internals.h | 134 |
8 files changed, 2963 insertions, 0 deletions
diff --git a/audio/filter/af_drop.c b/audio/filter/af_drop.c new file mode 100644 index 0000000..724c482 --- /dev/null +++ b/audio/filter/af_drop.c @@ -0,0 +1,114 @@ +#include "audio/aframe.h" +#include "audio/format.h" +#include "common/common.h" +#include "filters/f_autoconvert.h" +#include "filters/filter_internal.h" +#include "filters/user_filters.h" + +struct priv { + double speed; + double diff; // amount of too many additional samples in normal speed + struct mp_aframe *last; // for repeating +}; + +static void process(struct mp_filter *f) +{ + struct priv *p = f->priv; + + if (!mp_pin_in_needs_data(f->ppins[1])) + return; + + struct mp_frame frame = {0}; + + double last_dur = p->last ? mp_aframe_duration(p->last) : 0; + if (p->last && p->diff < 0 && -p->diff > last_dur / 2) { + MP_VERBOSE(f, "repeat\n"); + frame = MAKE_FRAME(MP_FRAME_AUDIO, p->last); + p->last = NULL; + } else { + frame = mp_pin_out_read(f->ppins[0]); + + if (frame.type == MP_FRAME_AUDIO) { + last_dur = mp_aframe_duration(frame.data); + p->diff -= last_dur; + if (p->diff > last_dur / 2) { + MP_VERBOSE(f, "drop\n"); + mp_frame_unref(&frame); + mp_filter_internal_mark_progress(f); + } + } + } + + if (frame.type == MP_FRAME_AUDIO) { + struct mp_aframe *fr = frame.data; + talloc_free(p->last); + p->last = mp_aframe_new_ref(fr); + mp_aframe_mul_speed(fr, p->speed); + p->diff += mp_aframe_duration(fr); + mp_aframe_set_pts(p->last, mp_aframe_end_pts(fr)); + } else if (frame.type == MP_FRAME_EOF) { + TA_FREEP(&p->last); + } + mp_pin_in_write(f->ppins[1], frame); +} + +static bool command(struct mp_filter *f, struct mp_filter_command *cmd) +{ + struct priv *p = f->priv; + + switch (cmd->type) { + case MP_FILTER_COMMAND_SET_SPEED: + p->speed = cmd->speed; + return true; + } + + return false; +} + +static void reset(struct mp_filter *f) +{ + struct priv *p = f->priv; + + TA_FREEP(&p->last); + p->diff = 0; +} + +static void destroy(struct mp_filter *f) +{ + reset(f); +} + +static const struct mp_filter_info af_drop_filter = { + .name = "drop", + .priv_size = sizeof(struct priv), + .process = process, + .command = command, + .reset = reset, + .destroy = destroy, +}; + +static struct mp_filter *af_drop_create(struct mp_filter *parent, void *options) +{ + struct mp_filter *f = mp_filter_create(parent, &af_drop_filter); + if (!f) { + talloc_free(options); + return NULL; + } + + mp_filter_add_pin(f, MP_PIN_IN, "in"); + mp_filter_add_pin(f, MP_PIN_OUT, "out"); + + struct priv *p = f->priv; + p->speed = 1.0; + + return f; +} + +const struct mp_user_filter_entry af_drop = { + .desc = { + .description = "Change audio speed by dropping/repeating frames", + .name = "drop", + .priv_size = sizeof(struct priv), + }, + .create = af_drop_create, +}; diff --git a/audio/filter/af_format.c b/audio/filter/af_format.c new file mode 100644 index 0000000..2d1c1cc --- /dev/null +++ b/audio/filter/af_format.c @@ -0,0 +1,143 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "audio/aframe.h" +#include "audio/format.h" +#include "filters/f_autoconvert.h" +#include "filters/filter_internal.h" +#include "filters/user_filters.h" +#include "options/m_option.h" + +struct f_opts { + int in_format; + int in_srate; + struct m_channels in_channels; + int out_format; + int out_srate; + struct m_channels out_channels; + + bool fail; +}; + +struct priv { + struct f_opts *opts; + struct mp_pin *in_pin; +}; + +static void process(struct mp_filter *f) +{ + struct priv *p = f->priv; + + if (!mp_pin_can_transfer_data(f->ppins[1], p->in_pin)) + return; + + struct mp_frame frame = mp_pin_out_read(p->in_pin); + + if (p->opts->fail) { + MP_ERR(f, "Failing on purpose.\n"); + goto error; + } + + if (frame.type == MP_FRAME_EOF) { + mp_pin_in_write(f->ppins[1], frame); + return; + } + + if (frame.type != MP_FRAME_AUDIO) { + MP_ERR(f, "audio frame expected\n"); + goto error; + } + + struct mp_aframe *in = frame.data; + + if (p->opts->out_channels.num_chmaps > 0) { + if (!mp_aframe_set_chmap(in, &p->opts->out_channels.chmaps[0])) { + MP_ERR(f, "could not force output channels\n"); + goto error; + } + } + + if (p->opts->out_srate) + mp_aframe_set_rate(in, p->opts->out_srate); + + mp_pin_in_write(f->ppins[1], frame); + return; + +error: + mp_frame_unref(&frame); + mp_filter_internal_mark_failed(f); +} + +static const struct mp_filter_info af_format_filter = { + .name = "format", + .priv_size = sizeof(struct priv), + .process = process, +}; + +static struct mp_filter *af_format_create(struct mp_filter *parent, + void *options) +{ + struct mp_filter *f = mp_filter_create(parent, &af_format_filter); + if (!f) { + talloc_free(options); + return NULL; + } + + struct priv *p = f->priv; + p->opts = talloc_steal(p, options); + + mp_filter_add_pin(f, MP_PIN_IN, "in"); + mp_filter_add_pin(f, MP_PIN_OUT, "out"); + + struct mp_autoconvert *conv = mp_autoconvert_create(f); + if (!conv) + abort(); + + if (p->opts->in_format) + mp_autoconvert_add_afmt(conv, p->opts->in_format); + if (p->opts->in_srate) + mp_autoconvert_add_srate(conv, p->opts->in_srate); + if (p->opts->in_channels.num_chmaps > 0) + mp_autoconvert_add_chmap(conv, &p->opts->in_channels.chmaps[0]); + + mp_pin_connect(conv->f->pins[0], f->ppins[0]); + p->in_pin = conv->f->pins[1]; + + return f; +} + +#define OPT_BASE_STRUCT struct f_opts + +const struct mp_user_filter_entry af_format = { + .desc = { + .name = "format", + .description = "Force audio format", + .priv_size = sizeof(struct f_opts), + .options = (const struct m_option[]) { + {"format", OPT_AUDIOFORMAT(in_format)}, + {"srate", OPT_INT(in_srate), M_RANGE(1000, 8*48000)}, + {"channels", OPT_CHANNELS(in_channels), + .flags = M_OPT_CHANNELS_LIMITED}, + {"out-srate", OPT_INT(out_srate), M_RANGE(1000, 8*48000)}, + {"out-channels", OPT_CHANNELS(out_channels), + .flags = M_OPT_CHANNELS_LIMITED}, + {"fail", OPT_BOOL(fail)}, + {0} + }, + }, + .create = af_format_create, +}; diff --git a/audio/filter/af_lavcac3enc.c b/audio/filter/af_lavcac3enc.c new file mode 100644 index 0000000..b4a1d59 --- /dev/null +++ b/audio/filter/af_lavcac3enc.c @@ -0,0 +1,437 @@ +/* + * audio filter for runtime AC-3 encoding with libavcodec. + * + * Copyright (C) 2007 Ulion <ulion A gmail P com> + * + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> +#include <assert.h> + +#include <libavcodec/avcodec.h> +#include <libavutil/intreadwrite.h> +#include <libavutil/common.h> +#include <libavutil/bswap.h> +#include <libavutil/mem.h> + +#include "config.h" + +#include "audio/aframe.h" +#include "audio/chmap_avchannel.h" +#include "audio/chmap_sel.h" +#include "audio/fmt-conversion.h" +#include "audio/format.h" +#include "common/av_common.h" +#include "common/common.h" +#include "filters/f_autoconvert.h" +#include "filters/f_utils.h" +#include "filters/filter_internal.h" +#include "filters/user_filters.h" +#include "options/m_option.h" + + +#define AC3_MAX_CHANNELS 6 +#define AC3_MAX_CODED_FRAME_SIZE 3840 +#define AC3_FRAME_SIZE (6 * 256) +const static uint16_t ac3_bitrate_tab[19] = { + 32, 40, 48, 56, 64, 80, 96, 112, 128, + 160, 192, 224, 256, 320, 384, 448, 512, 576, 640 +}; + +struct f_opts { + bool add_iec61937_header; + int bit_rate; + int min_channel_num; + char *encoder; + char **avopts; +}; + +struct priv { + struct f_opts *opts; + + struct mp_pin *in_pin; + struct mp_aframe *cur_format; + struct mp_aframe *in_frame; + struct mp_aframe_pool *out_pool; + + const struct AVCodec *lavc_acodec; + struct AVCodecContext *lavc_actx; + AVPacket *lavc_pkt; + int bit_rate; + int out_samples; // upper bound on encoded output per AC3 frame +}; + +static bool reinit(struct mp_filter *f) +{ + struct priv *s = f->priv; + + mp_aframe_reset(s->cur_format); + + static const int default_bit_rate[AC3_MAX_CHANNELS+1] = \ + {0, 96000, 192000, 256000, 384000, 448000, 448000}; + + if (s->opts->add_iec61937_header) { + s->out_samples = AC3_FRAME_SIZE; + } else { + s->out_samples = AC3_MAX_CODED_FRAME_SIZE / + mp_aframe_get_sstride(s->in_frame); + } + + int format = mp_aframe_get_format(s->in_frame); + int rate = mp_aframe_get_rate(s->in_frame); + struct mp_chmap chmap = {0}; + mp_aframe_get_chmap(s->in_frame, &chmap); + + int bit_rate = s->bit_rate; + if (!bit_rate && chmap.num < AC3_MAX_CHANNELS + 1) + bit_rate = default_bit_rate[chmap.num]; + + avcodec_close(s->lavc_actx); + + // Put sample parameters + s->lavc_actx->sample_fmt = af_to_avformat(format); + +#if !HAVE_AV_CHANNEL_LAYOUT + s->lavc_actx->channels = chmap.num; + s->lavc_actx->channel_layout = mp_chmap_to_lavc(&chmap); +#else + mp_chmap_to_av_layout(&s->lavc_actx->ch_layout, &chmap); +#endif + s->lavc_actx->sample_rate = rate; + s->lavc_actx->bit_rate = bit_rate; + + if (avcodec_open2(s->lavc_actx, s->lavc_acodec, NULL) < 0) { + MP_ERR(f, "Couldn't open codec %s, br=%d.\n", "ac3", bit_rate); + return false; + } + + if (s->lavc_actx->frame_size < 1) { + MP_ERR(f, "encoder didn't specify input frame size\n"); + return false; + } + + mp_aframe_config_copy(s->cur_format, s->in_frame); + return true; +} + +static void reset(struct mp_filter *f) +{ + struct priv *s = f->priv; + + TA_FREEP(&s->in_frame); +} + +static void destroy(struct mp_filter *f) +{ + struct priv *s = f->priv; + + reset(f); + av_packet_free(&s->lavc_pkt); + avcodec_free_context(&s->lavc_actx); +} + +static void swap_16(uint16_t *ptr, size_t size) +{ + for (size_t n = 0; n < size; n++) + ptr[n] = av_bswap16(ptr[n]); +} + +static void process(struct mp_filter *f) +{ + struct priv *s = f->priv; + + if (!mp_pin_in_needs_data(f->ppins[1])) + return; + + bool err = true; + struct mp_aframe *out = NULL; + AVPacket *pkt = s->lavc_pkt; + + // Send input as long as it wants. + while (1) { + if (avcodec_is_open(s->lavc_actx)) { + int lavc_ret = avcodec_receive_packet(s->lavc_actx, pkt); + if (lavc_ret >= 0) + break; + if (lavc_ret < 0 && lavc_ret != AVERROR(EAGAIN)) { + MP_FATAL(f, "Encode failed (receive).\n"); + goto error; + } + } + AVFrame *frame = NULL; + struct mp_frame input = mp_pin_out_read(s->in_pin); + // The following code assumes no sample data buffering in the encoder. + switch (input.type) { + case MP_FRAME_NONE: + goto done; // no data yet + case MP_FRAME_EOF: + mp_pin_in_write(f->ppins[1], input); + goto done; + case MP_FRAME_AUDIO: + TA_FREEP(&s->in_frame); + s->in_frame = input.data; + frame = mp_frame_to_av(input, NULL); + if (!frame) + goto error; + if (mp_aframe_get_channels(s->in_frame) < s->opts->min_channel_num) { + // Just pass it through. + s->in_frame = NULL; + mp_pin_in_write(f->ppins[1], input); + goto done; + } + if (!mp_aframe_config_equals(s->in_frame, s->cur_format)) { + if (!reinit(f)) + goto error; + } + break; + default: goto error; // unexpected packet type + } + int lavc_ret = avcodec_send_frame(s->lavc_actx, frame); + av_frame_free(&frame); + if (lavc_ret < 0 && lavc_ret != AVERROR(EAGAIN)) { + MP_FATAL(f, "Encode failed (send).\n"); + goto error; + } + } + + if (!s->in_frame) + goto error; + + out = mp_aframe_create(); + mp_aframe_set_format(out, AF_FORMAT_S_AC3); + mp_aframe_set_chmap(out, &(struct mp_chmap)MP_CHMAP_INIT_STEREO); + mp_aframe_set_rate(out, 48000); + + if (mp_aframe_pool_allocate(s->out_pool, out, s->out_samples) < 0) + goto error; + + int sstride = mp_aframe_get_sstride(out); + + mp_aframe_copy_attributes(out, s->in_frame); + + int frame_size = pkt->size; + int header_len = 0; + char hdr[8]; + + if (s->opts->add_iec61937_header && pkt->size > 5) { + int bsmod = pkt->data[5] & 0x7; + int len = frame_size; + + frame_size = AC3_FRAME_SIZE * 2 * 2; + header_len = 8; + + AV_WL16(hdr, 0xF872); // iec 61937 syncword 1 + AV_WL16(hdr + 2, 0x4E1F); // iec 61937 syncword 2 + hdr[5] = bsmod; // bsmod + hdr[4] = 0x01; // data-type ac3 + AV_WL16(hdr + 6, len << 3); // number of bits in payload + } + + if (frame_size > s->out_samples * sstride) + abort(); + + uint8_t **planes = mp_aframe_get_data_rw(out); + if (!planes) + goto error; + char *buf = planes[0]; + memcpy(buf, hdr, header_len); + memcpy(buf + header_len, pkt->data, pkt->size); + memset(buf + header_len + pkt->size, 0, + frame_size - (header_len + pkt->size)); + swap_16((uint16_t *)(buf + header_len), pkt->size / 2); + mp_aframe_set_size(out, frame_size / sstride); + mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out)); + out = NULL; + +done: + err = false; + // fall through +error: + av_packet_unref(pkt); + talloc_free(out); + if (err) + mp_filter_internal_mark_failed(f); +} + +static const struct mp_filter_info af_lavcac3enc_filter = { + .name = "lavcac3enc", + .priv_size = sizeof(struct priv), + .process = process, + .reset = reset, + .destroy = destroy, +}; + +static void add_chmaps_to_autoconv(struct mp_filter *f, + struct mp_autoconvert *conv, + const struct AVCodec *codec) +{ +#if !HAVE_AV_CHANNEL_LAYOUT + const uint64_t *lch = codec->channel_layouts; + for (int n = 0; lch && lch[n]; n++) { + struct mp_chmap chmap = {0}; + mp_chmap_from_lavc(&chmap, lch[n]); + if (mp_chmap_is_valid(&chmap)) + mp_autoconvert_add_chmap(conv, &chmap); + } +#else + const AVChannelLayout *lch = codec->ch_layouts; + for (int n = 0; lch && lch[n].nb_channels; n++) { + struct mp_chmap chmap = {0}; + + if (!mp_chmap_from_av_layout(&chmap, &lch[n])) { + char layout[128] = {0}; + MP_VERBOSE(f, "Skipping unsupported channel layout: %s\n", + av_channel_layout_describe(&lch[n], + layout, 128) < 0 ? + "undefined" : layout); + continue; + } + + if (mp_chmap_is_valid(&chmap)) + mp_autoconvert_add_chmap(conv, &chmap); + } +#endif +} + +static struct mp_filter *af_lavcac3enc_create(struct mp_filter *parent, + void *options) +{ + struct mp_filter *f = mp_filter_create(parent, &af_lavcac3enc_filter); + if (!f) { + talloc_free(options); + return NULL; + } + + mp_filter_add_pin(f, MP_PIN_IN, "in"); + mp_filter_add_pin(f, MP_PIN_OUT, "out"); + + struct priv *s = f->priv; + s->opts = talloc_steal(s, options); + s->cur_format = talloc_steal(s, mp_aframe_create()); + s->out_pool = mp_aframe_pool_create(s); + + s->lavc_acodec = avcodec_find_encoder_by_name(s->opts->encoder); + if (!s->lavc_acodec) { + MP_ERR(f, "Couldn't find encoder %s.\n", s->opts->encoder); + goto error; + } + + s->lavc_actx = avcodec_alloc_context3(s->lavc_acodec); + if (!s->lavc_actx) { + MP_ERR(f, "Audio LAVC, couldn't allocate context!\n"); + goto error; + } + + s->lavc_pkt = av_packet_alloc(); + if (!s->lavc_pkt) + goto error; + + if (mp_set_avopts(f->log, s->lavc_actx, s->opts->avopts) < 0) + goto error; + + // For this one, we require the decoder to export lists of all supported + // parameters. (Not all decoders do that, but the ones we're interested + // in do.) + if (!s->lavc_acodec->sample_fmts || +#if !HAVE_AV_CHANNEL_LAYOUT + !s->lavc_acodec->channel_layouts +#else + !s->lavc_acodec->ch_layouts +#endif + ) + { + MP_ERR(f, "Audio encoder doesn't list supported parameters.\n"); + goto error; + } + + if (s->opts->bit_rate) { + int i; + for (i = 0; i < 19; i++) { + if (ac3_bitrate_tab[i] == s->opts->bit_rate) { + s->bit_rate = ac3_bitrate_tab[i] * 1000; + break; + } + } + if (i >= 19) { + MP_WARN(f, "unable set unsupported bitrate %d, using default " + "bitrate (check manpage to see supported bitrates).\n", + s->opts->bit_rate); + } + } + + struct mp_autoconvert *conv = mp_autoconvert_create(f); + if (!conv) + abort(); + + const enum AVSampleFormat *lf = s->lavc_acodec->sample_fmts; + for (int i = 0; lf && lf[i] != AV_SAMPLE_FMT_NONE; i++) { + int mpfmt = af_from_avformat(lf[i]); + if (mpfmt) + mp_autoconvert_add_afmt(conv, mpfmt); + } + + add_chmaps_to_autoconv(f, conv, s->lavc_acodec); + + // At least currently, the AC3 encoder doesn't export sample rates. + mp_autoconvert_add_srate(conv, 48000); + + mp_pin_connect(conv->f->pins[0], f->ppins[0]); + + struct mp_filter *fs = mp_fixed_aframe_size_create(f, AC3_FRAME_SIZE, true); + if (!fs) + abort(); + + mp_pin_connect(fs->pins[0], conv->f->pins[1]); + s->in_pin = fs->pins[1]; + + return f; + +error: + av_packet_free(&s->lavc_pkt); + avcodec_free_context(&s->lavc_actx); + talloc_free(f); + return NULL; +} + +#define OPT_BASE_STRUCT struct f_opts + +const struct mp_user_filter_entry af_lavcac3enc = { + .desc = { + .description = "runtime encode to ac3 using libavcodec", + .name = "lavcac3enc", + .priv_size = sizeof(OPT_BASE_STRUCT), + .priv_defaults = &(const OPT_BASE_STRUCT) { + .add_iec61937_header = true, + .bit_rate = 640, + .min_channel_num = 3, + .encoder = "ac3", + }, + .options = (const struct m_option[]) { + {"tospdif", OPT_BOOL(add_iec61937_header)}, + {"bitrate", OPT_CHOICE(bit_rate, + {"auto", 0}, {"default", 0}), M_RANGE(32, 640)}, + {"minch", OPT_INT(min_channel_num), M_RANGE(2, 6)}, + {"encoder", OPT_STRING(encoder)}, + {"o", OPT_KEYVALUELIST(avopts)}, + {0} + }, + }, + .create = af_lavcac3enc_create, +}; diff --git a/audio/filter/af_rubberband.c b/audio/filter/af_rubberband.c new file mode 100644 index 0000000..48e5cc1 --- /dev/null +++ b/audio/filter/af_rubberband.c @@ -0,0 +1,382 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdlib.h> +#include <assert.h> + +#include <rubberband/rubberband-c.h> + +#include "config.h" + +#include "audio/aframe.h" +#include "audio/format.h" +#include "common/common.h" +#include "filters/f_autoconvert.h" +#include "filters/filter_internal.h" +#include "filters/user_filters.h" +#include "options/m_option.h" + +// command line options +struct f_opts { + int transients, detector, phase, window, + smoothing, formant, pitch, channels, engine; + double scale; +}; + +struct priv { + struct f_opts *opts; + + struct mp_pin *in_pin; + struct mp_aframe *cur_format; + struct mp_aframe_pool *out_pool; + bool sent_final; + RubberBandState rubber; + double speed; + double pitch; + struct mp_aframe *pending; + // Estimate how much librubberband has buffered internally. + // I could not find a way to do this with the librubberband API. + double rubber_delay; +}; + +static void update_speed(struct priv *p, double new_speed) +{ + p->speed = new_speed; + if (p->rubber) + rubberband_set_time_ratio(p->rubber, 1.0 / p->speed); +} + +static bool update_pitch(struct priv *p, double new_pitch) +{ + if (new_pitch < 0.01 || new_pitch > 100.0) + return false; + + p->pitch = new_pitch; + if (p->rubber) + rubberband_set_pitch_scale(p->rubber, p->pitch); + return true; +} + +static bool init_rubberband(struct mp_filter *f) +{ + struct priv *p = f->priv; + + assert(!p->rubber); + assert(p->pending); + + int opts = p->opts->transients | p->opts->detector | p->opts->phase | + p->opts->window | p->opts->smoothing | p->opts->formant | + p->opts->pitch | p->opts->channels | +#if HAVE_RUBBERBAND_3 + p->opts->engine | +#endif + RubberBandOptionProcessRealTime; + + int rate = mp_aframe_get_rate(p->pending); + int channels = mp_aframe_get_channels(p->pending); + if (mp_aframe_get_format(p->pending) != AF_FORMAT_FLOATP) + return false; + + p->rubber = rubberband_new(rate, channels, opts, 1.0, 1.0); + if (!p->rubber) { + MP_FATAL(f, "librubberband initialization failed.\n"); + return false; + } + + mp_aframe_config_copy(p->cur_format, p->pending); + + update_speed(p, p->speed); + update_pitch(p, p->pitch); + + return true; +} + +static void process(struct mp_filter *f) +{ + struct priv *p = f->priv; + + if (!mp_pin_in_needs_data(f->ppins[1])) + return; + + while (!p->rubber || !p->pending || rubberband_available(p->rubber) <= 0) { + const float *dummy[MP_NUM_CHANNELS] = {0}; + const float **in_data = dummy; + size_t in_samples = 0; + + bool eof = false; + if (!p->pending || !mp_aframe_get_size(p->pending)) { + struct mp_frame frame = mp_pin_out_read(p->in_pin); + if (frame.type == MP_FRAME_AUDIO) { + TA_FREEP(&p->pending); + p->pending = frame.data; + } else if (frame.type == MP_FRAME_EOF) { + eof = true; + } else if (frame.type) { + MP_ERR(f, "unexpected frame type\n"); + goto error; + } else { + return; // no new data yet + } + } + assert(p->pending || eof); + + if (!p->rubber) { + if (!p->pending) { + mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); + return; + } + if (!init_rubberband(f)) + goto error; + } + + bool format_change = + p->pending && !mp_aframe_config_equals(p->pending, p->cur_format); + + if (p->pending && !format_change) { + size_t needs = rubberband_get_samples_required(p->rubber); + uint8_t **planes = mp_aframe_get_data_ro(p->pending); + int num_planes = mp_aframe_get_planes(p->pending); + for (int n = 0; n < num_planes; n++) + in_data[n] = (void *)planes[n]; + in_samples = MPMIN(mp_aframe_get_size(p->pending), needs); + } + + bool final = format_change || eof; + if (!p->sent_final) + rubberband_process(p->rubber, in_data, in_samples, final); + p->sent_final |= final; + + p->rubber_delay += in_samples; + + if (p->pending && !format_change) + mp_aframe_skip_samples(p->pending, in_samples); + + if (rubberband_available(p->rubber) > 0) { + if (eof) + mp_pin_out_repeat_eof(p->in_pin); // drain more next time + } else { + if (eof) { + mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); + rubberband_reset(p->rubber); + p->rubber_delay = 0; + TA_FREEP(&p->pending); + p->sent_final = false; + return; + } else if (format_change) { + // go on with proper reinit on the next iteration + rubberband_delete(p->rubber); + p->sent_final = false; + p->rubber = NULL; + } + } + } + + assert(p->pending); + + int out_samples = rubberband_available(p->rubber); + if (out_samples > 0) { + struct mp_aframe *out = mp_aframe_new_ref(p->cur_format); + if (mp_aframe_pool_allocate(p->out_pool, out, out_samples) < 0) { + talloc_free(out); + goto error; + } + + mp_aframe_copy_attributes(out, p->pending); + + float *out_data[MP_NUM_CHANNELS] = {0}; + uint8_t **planes = mp_aframe_get_data_rw(out); + assert(planes); + int num_planes = mp_aframe_get_planes(out); + for (int n = 0; n < num_planes; n++) + out_data[n] = (void *)planes[n]; + + out_samples = rubberband_retrieve(p->rubber, out_data, out_samples); + + if (!out_samples) { + mp_filter_internal_mark_progress(f); // unexpected, just try again + talloc_free(out); + return; + } + + mp_aframe_set_size(out, out_samples); + + p->rubber_delay -= out_samples * p->speed; + + double pts = mp_aframe_get_pts(p->pending); + if (pts != MP_NOPTS_VALUE) { + // Note: rubberband_get_latency() does not do what you'd expect. + double delay = p->rubber_delay / mp_aframe_get_effective_rate(out); + mp_aframe_set_pts(out, pts - delay); + } + + mp_aframe_mul_speed(out, p->speed); + + mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out)); + } + + return; +error: + mp_filter_internal_mark_failed(f); +} + +static bool command(struct mp_filter *f, struct mp_filter_command *cmd) +{ + struct priv *p = f->priv; + + switch (cmd->type) { + case MP_FILTER_COMMAND_TEXT: { + char *endptr = NULL; + double pitch = p->pitch; + if (!strcmp(cmd->cmd, "set-pitch")) { + pitch = strtod(cmd->arg, &endptr); + if (*endptr) + return false; + return update_pitch(p, pitch); + } else if (!strcmp(cmd->cmd, "multiply-pitch")) { + double mult = strtod(cmd->arg, &endptr); + if (*endptr || mult <= 0) + return false; + pitch *= mult; + return update_pitch(p, pitch); + } + return false; + } + case MP_FILTER_COMMAND_SET_SPEED: + update_speed(p, cmd->speed); + return true; + } + + return false; +} + +static void reset(struct mp_filter *f) +{ + struct priv *p = f->priv; + + if (p->rubber) + rubberband_reset(p->rubber); + p->rubber_delay = 0; + p->sent_final = false; + TA_FREEP(&p->pending); +} + +static void destroy(struct mp_filter *f) +{ + struct priv *p = f->priv; + + if (p->rubber) + rubberband_delete(p->rubber); + talloc_free(p->pending); +} + +static const struct mp_filter_info af_rubberband_filter = { + .name = "rubberband", + .priv_size = sizeof(struct priv), + .process = process, + .command = command, + .reset = reset, + .destroy = destroy, +}; + +static struct mp_filter *af_rubberband_create(struct mp_filter *parent, + void *options) +{ + struct mp_filter *f = mp_filter_create(parent, &af_rubberband_filter); + if (!f) { + talloc_free(options); + return NULL; + } + + mp_filter_add_pin(f, MP_PIN_IN, "in"); + mp_filter_add_pin(f, MP_PIN_OUT, "out"); + + struct priv *p = f->priv; + p->opts = talloc_steal(p, options); + p->speed = 1.0; + p->pitch = p->opts->scale; + p->cur_format = talloc_steal(p, mp_aframe_create()); + p->out_pool = mp_aframe_pool_create(p); + + struct mp_autoconvert *conv = mp_autoconvert_create(f); + if (!conv) + abort(); + + mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOATP); + + mp_pin_connect(conv->f->pins[0], f->ppins[0]); + p->in_pin = conv->f->pins[1]; + + return f; +} + +#define OPT_BASE_STRUCT struct f_opts + +const struct mp_user_filter_entry af_rubberband = { + .desc = { + .description = "Pitch conversion with librubberband", + .name = "rubberband", + .priv_size = sizeof(OPT_BASE_STRUCT), + .priv_defaults = &(const OPT_BASE_STRUCT) { + .scale = 1.0, + .pitch = RubberBandOptionPitchHighConsistency, + .transients = RubberBandOptionTransientsMixed, + .formant = RubberBandOptionFormantPreserved, + .channels = RubberBandOptionChannelsTogether, +#if HAVE_RUBBERBAND_3 + .engine = RubberBandOptionEngineFiner, +#endif + }, + .options = (const struct m_option[]) { + {"transients", OPT_CHOICE(transients, + {"crisp", RubberBandOptionTransientsCrisp}, + {"mixed", RubberBandOptionTransientsMixed}, + {"smooth", RubberBandOptionTransientsSmooth})}, + {"detector", OPT_CHOICE(detector, + {"compound", RubberBandOptionDetectorCompound}, + {"percussive", RubberBandOptionDetectorPercussive}, + {"soft", RubberBandOptionDetectorSoft})}, + {"phase", OPT_CHOICE(phase, + {"laminar", RubberBandOptionPhaseLaminar}, + {"independent", RubberBandOptionPhaseIndependent})}, + {"window", OPT_CHOICE(window, + {"standard", RubberBandOptionWindowStandard}, + {"short", RubberBandOptionWindowShort}, + {"long", RubberBandOptionWindowLong})}, + {"smoothing", OPT_CHOICE(smoothing, + {"off", RubberBandOptionSmoothingOff}, + {"on", RubberBandOptionSmoothingOn})}, + {"formant", OPT_CHOICE(formant, + {"shifted", RubberBandOptionFormantShifted}, + {"preserved", RubberBandOptionFormantPreserved})}, + {"pitch", OPT_CHOICE(pitch, + {"quality", RubberBandOptionPitchHighQuality}, + {"speed", RubberBandOptionPitchHighSpeed}, + {"consistency", RubberBandOptionPitchHighConsistency})}, + {"channels", OPT_CHOICE(channels, + {"apart", RubberBandOptionChannelsApart}, + {"together", RubberBandOptionChannelsTogether})}, +#if HAVE_RUBBERBAND_3 + {"engine", OPT_CHOICE(engine, + {"finer", RubberBandOptionEngineFiner}, + {"faster", RubberBandOptionEngineFaster})}, +#endif + {"pitch-scale", OPT_DOUBLE(scale), M_RANGE(0.01, 100)}, + {0} + }, + }, + .create = af_rubberband_create, +}; diff --git a/audio/filter/af_scaletempo.c b/audio/filter/af_scaletempo.c new file mode 100644 index 0000000..f06478f --- /dev/null +++ b/audio/filter/af_scaletempo.c @@ -0,0 +1,626 @@ +/* + * scaletempo audio filter + * + * scale tempo while maintaining pitch + * (WSOLA technique with cross correlation) + * inspired by SoundTouch library by Olli Parviainen + * + * basic algorithm + * - produce 'stride' output samples per loop + * - consume stride*scale input samples per loop + * + * to produce smoother transitions between strides, blend next overlap + * samples from last stride with correlated samples of current input + * + * Copyright (c) 2007 Robert Juliano + * + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <float.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <assert.h> + +#include "audio/aframe.h" +#include "audio/format.h" +#include "common/common.h" +#include "filters/f_autoconvert.h" +#include "filters/filter_internal.h" +#include "filters/user_filters.h" +#include "options/m_option.h" + +struct f_opts { + float scale_nominal; + float ms_stride; + float ms_search; + float factor_overlap; +#define SCALE_TEMPO 1 +#define SCALE_PITCH 2 + int speed_opt; +}; + +struct priv { + struct f_opts *opts; + + struct mp_pin *in_pin; + struct mp_aframe *cur_format; + struct mp_aframe_pool *out_pool; + double current_pts; + struct mp_aframe *in; + + // stride + float scale; + float speed; + int frames_stride; + float frames_stride_scaled; + float frames_stride_error; + int bytes_per_frame; + int bytes_stride; + int bytes_queue; + int bytes_queued; + int bytes_to_slide; + int8_t *buf_queue; + // overlap + int samples_overlap; + int samples_standing; + int bytes_overlap; + int bytes_standing; + void *buf_overlap; + void *table_blend; + void (*output_overlap)(struct priv *s, void *out_buf, + int bytes_off); + // best overlap + int frames_search; + int num_channels; + void *buf_pre_corr; + void *table_window; + int (*best_overlap_offset)(struct priv *s); +}; + +static bool reinit(struct mp_filter *f); + +// Return whether it got enough data for filtering. +static bool fill_queue(struct priv *s) +{ + int bytes_in = s->in ? mp_aframe_get_size(s->in) * s->bytes_per_frame : 0; + int offset = 0; + + if (s->bytes_to_slide > 0) { + if (s->bytes_to_slide < s->bytes_queued) { + int bytes_move = s->bytes_queued - s->bytes_to_slide; + memmove(s->buf_queue, s->buf_queue + s->bytes_to_slide, bytes_move); + s->bytes_to_slide = 0; + s->bytes_queued = bytes_move; + } else { + int bytes_skip; + s->bytes_to_slide -= s->bytes_queued; + bytes_skip = MPMIN(s->bytes_to_slide, bytes_in); + s->bytes_queued = 0; + s->bytes_to_slide -= bytes_skip; + offset += bytes_skip; + bytes_in -= bytes_skip; + } + } + + int bytes_needed = s->bytes_queue - s->bytes_queued; + assert(bytes_needed >= 0); + + int bytes_copy = MPMIN(bytes_needed, bytes_in); + if (bytes_copy > 0) { + uint8_t **planes = mp_aframe_get_data_ro(s->in); + memcpy(s->buf_queue + s->bytes_queued, planes[0] + offset, bytes_copy); + s->bytes_queued += bytes_copy; + offset += bytes_copy; + bytes_needed -= bytes_copy; + } + + if (s->in) + mp_aframe_skip_samples(s->in, offset / s->bytes_per_frame); + + return bytes_needed == 0; +} + +#define UNROLL_PADDING (4 * 4) + +static int best_overlap_offset_float(struct priv *s) +{ + float best_corr = INT_MIN; + int best_off = 0; + + float *pw = s->table_window; + float *po = s->buf_overlap; + po += s->num_channels; + float *ppc = s->buf_pre_corr; + for (int i = s->num_channels; i < s->samples_overlap; i++) + *ppc++ = *pw++ **po++; + + float *search_start = (float *)s->buf_queue + s->num_channels; + for (int off = 0; off < s->frames_search; off++) { + float corr = 0; + float *ps = search_start; + ppc = s->buf_pre_corr; + for (int i = s->num_channels; i < s->samples_overlap; i++) + corr += *ppc++ **ps++; + if (corr > best_corr) { + best_corr = corr; + best_off = off; + } + search_start += s->num_channels; + } + + return best_off * 4 * s->num_channels; +} + +static int best_overlap_offset_s16(struct priv *s) +{ + int64_t best_corr = INT64_MIN; + int best_off = 0; + + int32_t *pw = s->table_window; + int16_t *po = s->buf_overlap; + po += s->num_channels; + int32_t *ppc = s->buf_pre_corr; + for (long i = s->num_channels; i < s->samples_overlap; i++) + *ppc++ = (*pw++ **po++) >> 15; + + int16_t *search_start = (int16_t *)s->buf_queue + s->num_channels; + for (int off = 0; off < s->frames_search; off++) { + int64_t corr = 0; + int16_t *ps = search_start; + ppc = s->buf_pre_corr; + ppc += s->samples_overlap - s->num_channels; + ps += s->samples_overlap - s->num_channels; + long i = -(s->samples_overlap - s->num_channels); + do { + corr += ppc[i + 0] * (int64_t)ps[i + 0]; + corr += ppc[i + 1] * (int64_t)ps[i + 1]; + corr += ppc[i + 2] * (int64_t)ps[i + 2]; + corr += ppc[i + 3] * (int64_t)ps[i + 3]; + i += 4; + } while (i < 0); + if (corr > best_corr) { + best_corr = corr; + best_off = off; + } + search_start += s->num_channels; + } + + return best_off * 2 * s->num_channels; +} + +static void output_overlap_float(struct priv *s, void *buf_out, + int bytes_off) +{ + float *pout = buf_out; + float *pb = s->table_blend; + float *po = s->buf_overlap; + float *pin = (float *)(s->buf_queue + bytes_off); + for (int i = 0; i < s->samples_overlap; i++) { + *pout++ = *po - *pb++ *(*po - *pin++); + po++; + } +} + +static void output_overlap_s16(struct priv *s, void *buf_out, + int bytes_off) +{ + int16_t *pout = buf_out; + int32_t *pb = s->table_blend; + int16_t *po = s->buf_overlap; + int16_t *pin = (int16_t *)(s->buf_queue + bytes_off); + for (int i = 0; i < s->samples_overlap; i++) { + *pout++ = *po - ((*pb++ *(*po - *pin++)) >> 16); + po++; + } +} + +static void process(struct mp_filter *f) +{ + struct priv *s = f->priv; + + if (!mp_pin_in_needs_data(f->ppins[1])) + return; + + struct mp_aframe *out = NULL; + + bool drain = false; + bool is_eof = false; + if (!s->in) { + struct mp_frame frame = mp_pin_out_read(s->in_pin); + if (!frame.type) + return; // no input yet + if (frame.type != MP_FRAME_AUDIO && frame.type != MP_FRAME_EOF) { + MP_ERR(f, "unexpected frame type\n"); + goto error; + } + + s->in = frame.type == MP_FRAME_AUDIO ? frame.data : NULL; + is_eof = drain = !s->in; + + // EOF before it was even initialized once. + if (is_eof && !mp_aframe_config_is_valid(s->cur_format)) { + mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); + return; + } + + if (s->in && !mp_aframe_config_equals(s->in, s->cur_format)) { + if (s->bytes_queued) { + // Drain remaining data before executing the format change. + MP_VERBOSE(f, "draining\n"); + mp_pin_out_unread(s->in_pin, frame); + s->in = NULL; + drain = true; + } else { + if (!reinit(f)) { + MP_ERR(f, "initialization failed\n"); + goto error; + } + } + } + + if (s->in) + s->current_pts = mp_aframe_end_pts(s->in); + } + + if (!fill_queue(s) && !drain) { + TA_FREEP(&s->in); + mp_pin_out_request_data_next(s->in_pin); + return; + } + + int max_out_samples = s->bytes_stride / s->bytes_per_frame; + if (drain) + max_out_samples += s->bytes_queued; + + out = mp_aframe_new_ref(s->cur_format); + if (mp_aframe_pool_allocate(s->out_pool, out, max_out_samples) < 0) + goto error; + + if (s->in) + mp_aframe_copy_attributes(out, s->in); + + uint8_t **out_planes = mp_aframe_get_data_rw(out); + if (!out_planes) + goto error; + int8_t *pout = out_planes[0]; + int out_offset = 0; + if (s->bytes_queued >= s->bytes_queue) { + int ti; + float tf; + int bytes_off = 0; + + // output stride + if (s->output_overlap) { + if (s->best_overlap_offset) + bytes_off = s->best_overlap_offset(s); + s->output_overlap(s, pout + out_offset, bytes_off); + } + memcpy(pout + out_offset + s->bytes_overlap, + s->buf_queue + bytes_off + s->bytes_overlap, + s->bytes_standing); + out_offset += s->bytes_stride; + + // input stride + memcpy(s->buf_overlap, + s->buf_queue + bytes_off + s->bytes_stride, + s->bytes_overlap); + tf = s->frames_stride_scaled + s->frames_stride_error; + ti = (int)tf; + s->frames_stride_error = tf - ti; + s->bytes_to_slide = ti * s->bytes_per_frame; + } + // Drain remaining buffered data. + if (drain && s->bytes_queued) { + memcpy(pout + out_offset, s->buf_queue, s->bytes_queued); + out_offset += s->bytes_queued; + s->bytes_queued = 0; + } + mp_aframe_set_size(out, out_offset / s->bytes_per_frame); + + // This filter can have a negative delay when scale > 1: + // output corresponding to some length of input can be decided and written + // after receiving only a part of that input. + float delay = (out_offset * s->speed + s->bytes_queued - s->bytes_to_slide) / + s->bytes_per_frame / mp_aframe_get_effective_rate(out) + + (s->in ? mp_aframe_duration(s->in) : 0); + + if (s->current_pts != MP_NOPTS_VALUE) + mp_aframe_set_pts(out, s->current_pts - delay); + + mp_aframe_mul_speed(out, s->speed); + + if (!mp_aframe_get_size(out)) + TA_FREEP(&out); + + if (is_eof && out) { + mp_pin_out_repeat_eof(s->in_pin); + } else if (is_eof && !out) { + mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); + } else if (!is_eof && !out) { + mp_pin_out_request_data_next(s->in_pin); + } + + if (out) + mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out)); + + return; + +error: + TA_FREEP(&s->in); + talloc_free(out); + mp_filter_internal_mark_failed(f); +} + +static void update_speed(struct priv *s, float speed) +{ + s->speed = speed; + + double factor = (s->opts->speed_opt & SCALE_PITCH) ? 1.0 / s->speed : s->speed; + s->scale = factor * s->opts->scale_nominal; + + s->frames_stride_scaled = s->scale * s->frames_stride; + s->frames_stride_error = MPMIN(s->frames_stride_error, s->frames_stride_scaled); +} + +static bool reinit(struct mp_filter *f) +{ + struct priv *s = f->priv; + + mp_aframe_reset(s->cur_format); + + float srate = mp_aframe_get_rate(s->in) / 1000.0; + int nch = mp_aframe_get_channels(s->in); + int format = mp_aframe_get_format(s->in); + + int use_int = 0; + if (format == AF_FORMAT_S16) { + use_int = 1; + } else if (format != AF_FORMAT_FLOAT) { + return false; + } + int bps = use_int ? 2 : 4; + + s->frames_stride = srate * s->opts->ms_stride; + s->bytes_stride = s->frames_stride * bps * nch; + + update_speed(s, s->speed); + + int frames_overlap = s->frames_stride * s->opts->factor_overlap; + if (frames_overlap <= 0) { + s->bytes_standing = s->bytes_stride; + s->samples_standing = s->bytes_standing / bps; + s->output_overlap = NULL; + s->bytes_overlap = 0; + } else { + s->samples_overlap = frames_overlap * nch; + s->bytes_overlap = frames_overlap * nch * bps; + s->bytes_standing = s->bytes_stride - s->bytes_overlap; + s->samples_standing = s->bytes_standing / bps; + s->buf_overlap = realloc(s->buf_overlap, s->bytes_overlap); + s->table_blend = realloc(s->table_blend, s->bytes_overlap * 4); + if (!s->buf_overlap || !s->table_blend) { + MP_FATAL(f, "Out of memory\n"); + return false; + } + memset(s->buf_overlap, 0, s->bytes_overlap); + if (use_int) { + int32_t *pb = s->table_blend; + int64_t blend = 0; + for (int i = 0; i < frames_overlap; i++) { + int32_t v = blend / frames_overlap; + for (int j = 0; j < nch; j++) + *pb++ = v; + blend += 65536; // 2^16 + } + s->output_overlap = output_overlap_s16; + } else { + float *pb = s->table_blend; + for (int i = 0; i < frames_overlap; i++) { + float v = i / (float)frames_overlap; + for (int j = 0; j < nch; j++) + *pb++ = v; + } + s->output_overlap = output_overlap_float; + } + } + + s->frames_search = (frames_overlap > 1) ? srate * s->opts->ms_search : 0; + if (s->frames_search <= 0) + s->best_overlap_offset = NULL; + else { + if (use_int) { + int64_t t = frames_overlap; + int32_t n = 8589934588LL / (t * t); // 4 * (2^31 - 1) / t^2 + s->buf_pre_corr = realloc(s->buf_pre_corr, + s->bytes_overlap * 2 + UNROLL_PADDING); + s->table_window = realloc(s->table_window, + s->bytes_overlap * 2 - nch * bps * 2); + if (!s->buf_pre_corr || !s->table_window) { + MP_FATAL(f, "Out of memory\n"); + return false; + } + memset((char *)s->buf_pre_corr + s->bytes_overlap * 2, 0, + UNROLL_PADDING); + int32_t *pw = s->table_window; + for (int i = 1; i < frames_overlap; i++) { + int32_t v = (i * (t - i) * n) >> 15; + for (int j = 0; j < nch; j++) + *pw++ = v; + } + s->best_overlap_offset = best_overlap_offset_s16; + } else { + s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap); + s->table_window = realloc(s->table_window, + s->bytes_overlap - nch * bps); + if (!s->buf_pre_corr || !s->table_window) { + MP_FATAL(f, "Out of memory\n"); + return false; + } + float *pw = s->table_window; + for (int i = 1; i < frames_overlap; i++) { + float v = i * (frames_overlap - i); + for (int j = 0; j < nch; j++) + *pw++ = v; + } + s->best_overlap_offset = best_overlap_offset_float; + } + } + + s->bytes_per_frame = bps * nch; + s->num_channels = nch; + + s->bytes_queue = (s->frames_search + s->frames_stride + frames_overlap) + * bps * nch; + s->buf_queue = realloc(s->buf_queue, s->bytes_queue + UNROLL_PADDING); + if (!s->buf_queue) { + MP_FATAL(f, "Out of memory\n"); + return false; + } + + s->bytes_queued = 0; + s->bytes_to_slide = 0; + + MP_DBG(f, "" + "%.2f stride_in, %i stride_out, %i standing, " + "%i overlap, %i search, %i queue, %s mode\n", + s->frames_stride_scaled, + (int)(s->bytes_stride / nch / bps), + (int)(s->bytes_standing / nch / bps), + (int)(s->bytes_overlap / nch / bps), + s->frames_search, + (int)(s->bytes_queue / nch / bps), + (use_int ? "s16" : "float")); + + mp_aframe_config_copy(s->cur_format, s->in); + + return true; +} + +static bool command(struct mp_filter *f, struct mp_filter_command *cmd) +{ + struct priv *s = f->priv; + + if (cmd->type == MP_FILTER_COMMAND_SET_SPEED) { + if (s->opts->speed_opt & SCALE_TEMPO) { + if (s->opts->speed_opt & SCALE_PITCH) + return false; + update_speed(s, cmd->speed); + return true; + } else if (s->opts->speed_opt & SCALE_PITCH) { + update_speed(s, cmd->speed); + return false; // do not signal OK + } + } + + return false; +} + +static void reset(struct mp_filter *f) +{ + struct priv *s = f->priv; + + s->current_pts = MP_NOPTS_VALUE; + s->bytes_queued = 0; + s->bytes_to_slide = 0; + s->frames_stride_error = 0; + if (s->buf_overlap && s->bytes_overlap) + memset(s->buf_overlap, 0, s->bytes_overlap); + TA_FREEP(&s->in); +} + +static void destroy(struct mp_filter *f) +{ + struct priv *s = f->priv; + free(s->buf_queue); + free(s->buf_overlap); + free(s->buf_pre_corr); + free(s->table_blend); + free(s->table_window); + TA_FREEP(&s->in); + mp_filter_free_children(f); +} + +static const struct mp_filter_info af_scaletempo_filter = { + .name = "scaletempo", + .priv_size = sizeof(struct priv), + .process = process, + .command = command, + .reset = reset, + .destroy = destroy, +}; + +static struct mp_filter *af_scaletempo_create(struct mp_filter *parent, + void *options) +{ + struct mp_filter *f = mp_filter_create(parent, &af_scaletempo_filter); + if (!f) { + talloc_free(options); + return NULL; + } + + mp_filter_add_pin(f, MP_PIN_IN, "in"); + mp_filter_add_pin(f, MP_PIN_OUT, "out"); + + struct priv *s = f->priv; + s->opts = talloc_steal(s, options); + s->speed = 1.0; + s->cur_format = talloc_steal(s, mp_aframe_create()); + s->out_pool = mp_aframe_pool_create(s); + + struct mp_autoconvert *conv = mp_autoconvert_create(f); + if (!conv) + abort(); + + mp_autoconvert_add_afmt(conv, AF_FORMAT_S16); + mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOAT); + + mp_pin_connect(conv->f->pins[0], f->ppins[0]); + s->in_pin = conv->f->pins[1]; + + return f; +} + +#define OPT_BASE_STRUCT struct f_opts + +const struct mp_user_filter_entry af_scaletempo = { + .desc = { + .description = "Scale audio tempo while maintaining pitch", + .name = "scaletempo", + .priv_size = sizeof(OPT_BASE_STRUCT), + .priv_defaults = &(const OPT_BASE_STRUCT) { + .ms_stride = 60, + .factor_overlap = .20, + .ms_search = 14, + .speed_opt = SCALE_TEMPO, + .scale_nominal = 1.0, + }, + .options = (const struct m_option[]) { + {"scale", OPT_FLOAT(scale_nominal), M_RANGE(0.01, DBL_MAX)}, + {"stride", OPT_FLOAT(ms_stride), M_RANGE(0.01, DBL_MAX)}, + {"overlap", OPT_FLOAT(factor_overlap), M_RANGE(0, 1)}, + {"search", OPT_FLOAT(ms_search), M_RANGE(0, DBL_MAX)}, + {"speed", OPT_CHOICE(speed_opt, + {"pitch", SCALE_PITCH}, + {"tempo", SCALE_TEMPO}, + {"none", 0}, + {"both", SCALE_TEMPO | SCALE_PITCH})}, + {0} + }, + }, + .create = af_scaletempo_create, +}; diff --git a/audio/filter/af_scaletempo2.c b/audio/filter/af_scaletempo2.c new file mode 100644 index 0000000..7ad8e35 --- /dev/null +++ b/audio/filter/af_scaletempo2.c @@ -0,0 +1,254 @@ +#include "audio/aframe.h" +#include "audio/filter/af_scaletempo2_internals.h" +#include "audio/format.h" +#include "common/common.h" +#include "filters/f_autoconvert.h" +#include "filters/filter_internal.h" +#include "filters/user_filters.h" +#include "options/m_option.h" + +struct priv { + struct mp_scaletempo2 data; + struct mp_pin *in_pin; + struct mp_aframe *cur_format; + struct mp_aframe_pool *out_pool; + bool sent_final; + struct mp_aframe *pending; + bool initialized; + float speed; +}; + +static bool init_scaletempo2(struct mp_filter *f); +static void reset(struct mp_filter *f); + +static void process(struct mp_filter *f) +{ + struct priv *p = f->priv; + + if (!mp_pin_in_needs_data(f->ppins[1])) + return; + + while (!p->initialized || !p->pending || + !mp_scaletempo2_frames_available(&p->data, p->speed)) + { + bool eof = false; + if (!p->pending || !mp_aframe_get_size(p->pending)) { + struct mp_frame frame = mp_pin_out_read(p->in_pin); + if (frame.type == MP_FRAME_AUDIO) { + TA_FREEP(&p->pending); + p->pending = frame.data; + } else if (frame.type == MP_FRAME_EOF) { + eof = true; + } else if (frame.type) { + MP_ERR(f, "unexpected frame type\n"); + goto error; + } else { + return; // no new data yet + } + } + assert(p->pending || eof); + + if (!p->initialized) { + if (!p->pending) { + mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); + return; + } + if (!init_scaletempo2(f)) + goto error; + } + + bool format_change = + p->pending && !mp_aframe_config_equals(p->pending, p->cur_format); + + bool final = format_change || eof; + if (p->pending && !format_change && !p->sent_final) { + int frame_size = mp_aframe_get_size(p->pending); + uint8_t **planes = mp_aframe_get_data_ro(p->pending); + int read = mp_scaletempo2_fill_input_buffer(&p->data, + planes, frame_size, p->speed); + mp_aframe_skip_samples(p->pending, read); + } + if (final && p->pending && !p->sent_final) { + mp_scaletempo2_set_final(&p->data); + p->sent_final = true; + } + + if (mp_scaletempo2_frames_available(&p->data, p->speed)) { + if (eof) { + mp_pin_out_repeat_eof(p->in_pin); // drain more next time + } + } else if (final) { + p->initialized = false; + p->sent_final = false; + if (eof) { + mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); + return; + } + // for format change go on with proper reinit on the next iteration + } + } + + assert(p->pending); + if (mp_scaletempo2_frames_available(&p->data, p->speed)) { + struct mp_aframe *out = mp_aframe_new_ref(p->cur_format); + int out_samples = p->data.ola_hop_size; + if (mp_aframe_pool_allocate(p->out_pool, out, out_samples) < 0) { + talloc_free(out); + goto error; + } + + mp_aframe_copy_attributes(out, p->pending); + + uint8_t **planes = mp_aframe_get_data_rw(out); + assert(planes); + assert(mp_aframe_get_planes(out) == p->data.channels); + + out_samples = mp_scaletempo2_fill_buffer(&p->data, + (float**)planes, out_samples, p->speed); + + double pts = mp_aframe_get_pts(p->pending); + if (pts != MP_NOPTS_VALUE) { + double frame_delay = mp_scaletempo2_get_latency(&p->data, p->speed) + + out_samples * p->speed; + mp_aframe_set_pts(out, pts - frame_delay / mp_aframe_get_effective_rate(out)); + + if (p->sent_final) { + double remain_pts = pts - mp_aframe_get_pts(out); + double rate = mp_aframe_get_effective_rate(out) / p->speed; + int max_samples = MPMAX(0, (int) (remain_pts * rate)); + // truncate final packet to expected length + if (out_samples >= max_samples) { + out_samples = max_samples; + + // reset the filter to ensure it stops generating audio + // and mp_scaletempo2_frames_available returns false + mp_scaletempo2_reset(&p->data); + } + } + } + + mp_aframe_set_size(out, out_samples); + mp_aframe_mul_speed(out, p->speed); + mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out)); + } + + return; +error: + mp_filter_internal_mark_failed(f); +} + +static bool init_scaletempo2(struct mp_filter *f) +{ + struct priv *p = f->priv; + assert(p->pending); + + if (mp_aframe_get_format(p->pending) != AF_FORMAT_FLOATP) + return false; + + mp_aframe_reset(p->cur_format); + p->initialized = true; + p->sent_final = false; + mp_aframe_config_copy(p->cur_format, p->pending); + + mp_scaletempo2_init(&p->data, mp_aframe_get_channels(p->pending), + mp_aframe_get_rate(p->pending)); + + return true; +} + +static bool command(struct mp_filter *f, struct mp_filter_command *cmd) +{ + struct priv *p = f->priv; + + switch (cmd->type) { + case MP_FILTER_COMMAND_SET_SPEED: + p->speed = cmd->speed; + return true; + } + + return false; +} + +static void reset(struct mp_filter *f) +{ + struct priv *p = f->priv; + mp_scaletempo2_reset(&p->data); + p->initialized = false; + TA_FREEP(&p->pending); +} + +static void destroy(struct mp_filter *f) +{ + struct priv *p = f->priv; + mp_scaletempo2_destroy(&p->data); + talloc_free(p->pending); +} + +static const struct mp_filter_info af_scaletempo2_filter = { + .name = "scaletempo2", + .priv_size = sizeof(struct priv), + .process = process, + .command = command, + .reset = reset, + .destroy = destroy, +}; + +static struct mp_filter *af_scaletempo2_create( + struct mp_filter *parent, void *options) +{ + struct mp_filter *f = mp_filter_create(parent, &af_scaletempo2_filter); + if (!f) { + talloc_free(options); + return NULL; + } + + mp_filter_add_pin(f, MP_PIN_IN, "in"); + mp_filter_add_pin(f, MP_PIN_OUT, "out"); + + struct priv *p = f->priv; + p->data.opts = talloc_steal(p, options); + p->speed = 1.0; + p->cur_format = talloc_steal(p, mp_aframe_create()); + p->out_pool = mp_aframe_pool_create(p); + p->pending = NULL; + p->initialized = false; + + struct mp_autoconvert *conv = mp_autoconvert_create(f); + if (!conv) + abort(); + + mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOATP); + + mp_pin_connect(conv->f->pins[0], f->ppins[0]); + p->in_pin = conv->f->pins[1]; + + return f; +} + +#define OPT_BASE_STRUCT struct mp_scaletempo2_opts +const struct mp_user_filter_entry af_scaletempo2 = { + .desc = { + .description = "Scale audio tempo while maintaining pitch" + " (filter ported from chromium)", + .name = "scaletempo2", + .priv_size = sizeof(OPT_BASE_STRUCT), + .priv_defaults = &(const OPT_BASE_STRUCT) { + .min_playback_rate = 0.25, + .max_playback_rate = 8.0, + .ola_window_size_ms = 12, + .wsola_search_interval_ms = 40, + }, + .options = (const struct m_option[]) { + {"search-interval", + OPT_FLOAT(wsola_search_interval_ms), M_RANGE(1, 1000)}, + {"window-size", + OPT_FLOAT(ola_window_size_ms), M_RANGE(1, 1000)}, + {"min-speed", + OPT_FLOAT(min_playback_rate), M_RANGE(0, FLT_MAX)}, + {"max-speed", + OPT_FLOAT(max_playback_rate), M_RANGE(0, FLT_MAX)}, + {0} + } + }, + .create = af_scaletempo2_create, +}; diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c new file mode 100644 index 0000000..534f4f6 --- /dev/null +++ b/audio/filter/af_scaletempo2_internals.c @@ -0,0 +1,873 @@ +#include <float.h> +#include <math.h> + +#include "audio/chmap.h" +#include "audio/filter/af_scaletempo2_internals.h" + +#include "config.h" + +// Algorithm overview (from chromium): +// Waveform Similarity Overlap-and-add (WSOLA). +// +// One WSOLA iteration +// +// 1) Extract |target_block| as input frames at indices +// [|target_block_index|, |target_block_index| + |ola_window_size|). +// Note that |target_block| is the "natural" continuation of the output. +// +// 2) Extract |search_block| as input frames at indices +// [|search_block_index|, +// |search_block_index| + |num_candidate_blocks| + |ola_window_size|). +// +// 3) Find a block within the |search_block| that is most similar +// to |target_block|. Let |optimal_index| be the index of such block and +// write it to |optimal_block|. +// +// 4) Update: +// |optimal_block| = |transition_window| * |target_block| + +// (1 - |transition_window|) * |optimal_block|. +// +// 5) Overlap-and-add |optimal_block| to the |wsola_output|. +// +// 6) Update:write + +struct interval { + int lo; + int hi; +}; + +static bool in_interval(int n, struct interval q) +{ + return n >= q.lo && n <= q.hi; +} + +static float **realloc_2d(float **p, int x, int y) +{ + float **array = realloc(p, sizeof(float*) * x + sizeof(float) * x * y); + float* data = (float*) (array + x); + for (int i = 0; i < x; ++i) { + array[i] = data + i * y; + } + return array; +} + +static void zero_2d(float **a, int x, int y) +{ + memset(a + x, 0, sizeof(float) * x * y); +} + +static void zero_2d_partial(float **a, int x, int y) +{ + for (int i = 0; i < x; ++i) { + memset(a[i], 0, sizeof(float) * y); + } +} + +// Energies of sliding windows of channels are interleaved. +// The number windows is |input_frames| - (|frames_per_window| - 1), hence, +// the method assumes |energy| must be, at least, of size +// (|input_frames| - (|frames_per_window| - 1)) * |channels|. +static void multi_channel_moving_block_energies( + float **input, int input_frames, int channels, + int frames_per_block, float *energy) +{ + int num_blocks = input_frames - (frames_per_block - 1); + + for (int k = 0; k < channels; ++k) { + const float* input_channel = input[k]; + + energy[k] = 0; + + // First block of channel |k|. + for (int m = 0; m < frames_per_block; ++m) { + energy[k] += input_channel[m] * input_channel[m]; + } + + const float* slide_out = input_channel; + const float* slide_in = input_channel + frames_per_block; + for (int n = 1; n < num_blocks; ++n, ++slide_in, ++slide_out) { + energy[k + n * channels] = energy[k + (n - 1) * channels] + - *slide_out * *slide_out + *slide_in * *slide_in; + } + } +} + +static float multi_channel_similarity_measure( + const float* dot_prod_a_b, + const float* energy_a, const float* energy_b, + int channels) +{ + const float epsilon = 1e-12f; + float similarity_measure = 0.0f; + for (int n = 0; n < channels; ++n) { + similarity_measure += dot_prod_a_b[n] + / sqrtf(energy_a[n] * energy_b[n] + epsilon); + } + return similarity_measure; +} + +#if HAVE_VECTOR + +typedef float v8sf __attribute__ ((vector_size (32), aligned (1))); + +// Dot-product of channels of two AudioBus. For each AudioBus an offset is +// given. |dot_product[k]| is the dot-product of channel |k|. The caller should +// allocate sufficient space for |dot_product|. +static void multi_channel_dot_product( + float **a, int frame_offset_a, + float **b, int frame_offset_b, + int channels, + int num_frames, float *dot_product) +{ + assert(frame_offset_a >= 0); + assert(frame_offset_b >= 0); + + for (int k = 0; k < channels; ++k) { + const float* ch_a = a[k] + frame_offset_a; + const float* ch_b = b[k] + frame_offset_b; + float sum = 0.0; + if (num_frames < 32) + goto rest; + + const v8sf *va = (const v8sf *) ch_a; + const v8sf *vb = (const v8sf *) ch_b; + v8sf vsum[4] = { + // Initialize to product of first 32 floats + va[0] * vb[0], + va[1] * vb[1], + va[2] * vb[2], + va[3] * vb[3], + }; + va += 4; + vb += 4; + + // Process `va` and `vb` across four vertical stripes + for (int n = 1; n < num_frames / 32; n++) { + vsum[0] += va[0] * vb[0]; + vsum[1] += va[1] * vb[1]; + vsum[2] += va[2] * vb[2]; + vsum[3] += va[3] * vb[3]; + va += 4; + vb += 4; + } + + // Vertical sum across `vsum` entries + vsum[0] += vsum[1]; + vsum[2] += vsum[3]; + vsum[0] += vsum[2]; + + // Horizontal sum across `vsum[0]`, could probably be done better but + // this section is not super performance critical + float *vf = (float *) &vsum[0]; + sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7]; + ch_a = (const float *) va; + ch_b = (const float *) vb; + +rest: + // Process the remainder + for (int n = 0; n < num_frames % 32; n++) + sum += *ch_a++ * *ch_b++; + + dot_product[k] = sum; + } +} + +#else // !HAVE_VECTOR + +static void multi_channel_dot_product( + float **a, int frame_offset_a, + float **b, int frame_offset_b, + int channels, + int num_frames, float *dot_product) +{ + assert(frame_offset_a >= 0); + assert(frame_offset_b >= 0); + + for (int k = 0; k < channels; ++k) { + const float* ch_a = a[k] + frame_offset_a; + const float* ch_b = b[k] + frame_offset_b; + float sum = 0.0; + for (int n = 0; n < num_frames; n++) + sum += *ch_a++ * *ch_b++; + dot_product[k] = sum; + } +} + +#endif // HAVE_VECTOR + +// Fit the curve f(x) = a * x^2 + b * x + c such that +// f(-1) = y[0] +// f(0) = y[1] +// f(1) = y[2] +// and return the maximum, assuming that y[0] <= y[1] >= y[2]. +static void quadratic_interpolation( + const float* y_values, float* extremum, float* extremum_value) +{ + float a = 0.5f * (y_values[2] + y_values[0]) - y_values[1]; + float b = 0.5f * (y_values[2] - y_values[0]); + float c = y_values[1]; + + if (a == 0.f) { + // The coordinates are colinear (within floating-point error). + *extremum = 0; + *extremum_value = y_values[1]; + } else { + *extremum = -b / (2.f * a); + *extremum_value = a * (*extremum) * (*extremum) + b * (*extremum) + c; + } +} + +// Search a subset of all candid blocks. The search is performed every +// |decimation| frames. This reduces complexity by a factor of about +// 1 / |decimation|. A cubic interpolation is used to have a better estimate of +// the best match. +static int decimated_search( + int decimation, struct interval exclude_interval, + float **target_block, int target_block_frames, + float **search_segment, int search_segment_frames, + int channels, + const float *energy_target_block, const float *energy_candidate_blocks) +{ + int num_candidate_blocks = search_segment_frames - (target_block_frames - 1); + float dot_prod [MP_NUM_CHANNELS]; + float similarity[3]; // Three elements for cubic interpolation. + + int n = 0; + multi_channel_dot_product( + target_block, 0, + search_segment, n, + channels, + target_block_frames, dot_prod); + similarity[0] = multi_channel_similarity_measure( + dot_prod, energy_target_block, + &energy_candidate_blocks[n * channels], channels); + + // Set the starting point as optimal point. + float best_similarity = similarity[0]; + int optimal_index = 0; + + n += decimation; + if (n >= num_candidate_blocks) { + return 0; + } + + multi_channel_dot_product( + target_block, 0, + search_segment, n, + channels, + target_block_frames, dot_prod); + similarity[1] = multi_channel_similarity_measure( + dot_prod, energy_target_block, + &energy_candidate_blocks[n * channels], channels); + + n += decimation; + if (n >= num_candidate_blocks) { + // We cannot do any more sampling. Compare these two values and return the + // optimal index. + return similarity[1] > similarity[0] ? decimation : 0; + } + + for (; n < num_candidate_blocks; n += decimation) { + multi_channel_dot_product( + target_block, 0, + search_segment, n, + channels, + target_block_frames, dot_prod); + + similarity[2] = multi_channel_similarity_measure( + dot_prod, energy_target_block, + &energy_candidate_blocks[n * channels], channels); + + if ((similarity[1] > similarity[0] && similarity[1] >= similarity[2]) || + (similarity[1] >= similarity[0] && similarity[1] > similarity[2])) + { + // A local maximum is found. Do a cubic interpolation for a better + // estimate of candidate maximum. + float normalized_candidate_index; + float candidate_similarity; + quadratic_interpolation(similarity, &normalized_candidate_index, + &candidate_similarity); + + int candidate_index = n - decimation + + (int)(normalized_candidate_index * decimation + 0.5f); + if (candidate_similarity > best_similarity + && !in_interval(candidate_index, exclude_interval)) { + optimal_index = candidate_index; + best_similarity = candidate_similarity; + } + } else if (n + decimation >= num_candidate_blocks && + similarity[2] > best_similarity && + !in_interval(n, exclude_interval)) + { + // If this is the end-point and has a better similarity-measure than + // optimal, then we accept it as optimal point. + optimal_index = n; + best_similarity = similarity[2]; + } + memmove(similarity, &similarity[1], 2 * sizeof(*similarity)); + } + return optimal_index; +} + +// Search [|low_limit|, |high_limit|] of |search_segment| to find a block that +// is most similar to |target_block|. |energy_target_block| is the energy of the +// |target_block|. |energy_candidate_blocks| is the energy of all blocks within +// |search_block|. +static int full_search( + int low_limit, int high_limit, + struct interval exclude_interval, + float **target_block, int target_block_frames, + float **search_block, int search_block_frames, + int channels, + const float* energy_target_block, + const float* energy_candidate_blocks) +{ + // int block_size = target_block->frames; + float dot_prod [sizeof(float) * MP_NUM_CHANNELS]; + + float best_similarity = -FLT_MAX;//FLT_MIN; + int optimal_index = 0; + + for (int n = low_limit; n <= high_limit; ++n) { + if (in_interval(n, exclude_interval)) { + continue; + } + multi_channel_dot_product(target_block, 0, search_block, n, channels, + target_block_frames, dot_prod); + + float similarity = multi_channel_similarity_measure( + dot_prod, energy_target_block, + &energy_candidate_blocks[n * channels], channels); + + if (similarity > best_similarity) { + best_similarity = similarity; + optimal_index = n; + } + } + + return optimal_index; +} + +// Find the index of the block, within |search_block|, that is most similar +// to |target_block|. Obviously, the returned index is w.r.t. |search_block|. +// |exclude_interval| is an interval that is excluded from the search. +static int compute_optimal_index( + float **search_block, int search_block_frames, + float **target_block, int target_block_frames, + float *energy_candidate_blocks, + int channels, + struct interval exclude_interval) +{ + int num_candidate_blocks = search_block_frames - (target_block_frames - 1); + + // This is a compromise between complexity reduction and search accuracy. I + // don't have a proof that down sample of order 5 is optimal. + // One can compute a decimation factor that minimizes complexity given + // the size of |search_block| and |target_block|. However, my experiments + // show the rate of missing the optimal index is significant. + // This value is chosen heuristically based on experiments. + const int search_decimation = 5; + + float energy_target_block [MP_NUM_CHANNELS]; + // energy_candidate_blocks must have at least size + // sizeof(float) * channels * num_candidate_blocks + + // Energy of all candid frames. + multi_channel_moving_block_energies( + search_block, + search_block_frames, + channels, + target_block_frames, + energy_candidate_blocks); + + // Energy of target frame. + multi_channel_dot_product( + target_block, 0, + target_block, 0, + channels, + target_block_frames, energy_target_block); + + int optimal_index = decimated_search( + search_decimation, exclude_interval, + target_block, target_block_frames, + search_block, search_block_frames, + channels, + energy_target_block, + energy_candidate_blocks); + + int lim_low = MPMAX(0, optimal_index - search_decimation); + int lim_high = MPMIN(num_candidate_blocks - 1, + optimal_index + search_decimation); + return full_search( + lim_low, lim_high, exclude_interval, + target_block, target_block_frames, + search_block, search_block_frames, + channels, + energy_target_block, energy_candidate_blocks); +} + +static void peek_buffer(struct mp_scaletempo2 *p, + int frames, int read_offset, int write_offset, float **dest) +{ + assert(p->input_buffer_frames >= frames); + for (int i = 0; i < p->channels; ++i) { + memcpy(dest[i] + write_offset, + p->input_buffer[i] + read_offset, + frames * sizeof(float)); + } +} + +static void seek_buffer(struct mp_scaletempo2 *p, int frames) +{ + assert(p->input_buffer_frames >= frames); + p->input_buffer_frames -= frames; + if (p->input_buffer_final_frames > 0) { + p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames); + } + for (int i = 0; i < p->channels; ++i) { + memmove(p->input_buffer[i], p->input_buffer[i] + frames, + p->input_buffer_frames * sizeof(float)); + } +} + +static int write_completed_frames_to(struct mp_scaletempo2 *p, + int requested_frames, int dest_offset, float **dest) +{ + int rendered_frames = MPMIN(p->num_complete_frames, requested_frames); + + if (rendered_frames == 0) + return 0; // There is nothing to read from |wsola_output|, return. + + for (int i = 0; i < p->channels; ++i) { + memcpy(dest[i] + dest_offset, p->wsola_output[i], + rendered_frames * sizeof(float)); + } + + // Remove the frames which are read. + int frames_to_move = p->wsola_output_size - rendered_frames; + for (int k = 0; k < p->channels; ++k) { + float *ch = p->wsola_output[k]; + memmove(ch, &ch[rendered_frames], sizeof(*ch) * frames_to_move); + } + p->num_complete_frames -= rendered_frames; + return rendered_frames; +} + +// next output_time for the given playback_rate +static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate) +{ + return p->output_time + p->ola_hop_size * playback_rate; +} + +// search_block_index for the given output_time +static int get_search_block_index(struct mp_scaletempo2 *p, double output_time) +{ + return (int)(output_time - p->search_block_center_offset + 0.5); +} + +// number of frames needed until a wsola iteration can be performed +static int frames_needed(struct mp_scaletempo2 *p, double playback_rate) +{ + int search_block_index = + get_search_block_index(p, get_updated_time(p, playback_rate)); + return MPMAX(0, MPMAX( + p->target_block_index + p->ola_window_size - p->input_buffer_frames, + search_block_index + p->search_block_size - p->input_buffer_frames)); +} + +static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate) +{ + return frames_needed(p, playback_rate) <= 0; +} + +static void resize_input_buffer(struct mp_scaletempo2 *p, int size) +{ + p->input_buffer_size = size; + p->input_buffer = realloc_2d(p->input_buffer, p->channels, size); +} + +// pad end with silence until a wsola iteration can be performed +static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate) +{ + int needed = frames_needed(p, playback_rate); + if (needed <= 0) + return; // no silence needed for iteration + + int required_size = needed + p->input_buffer_frames; + if (required_size > p->input_buffer_size) + resize_input_buffer(p, required_size); + + for (int i = 0; i < p->channels; ++i) { + float *ch_input = p->input_buffer[i]; + for (int j = 0; j < needed; ++j) { + ch_input[p->input_buffer_frames + j] = 0.0f; + } + } + + p->input_buffer_added_silence += needed; + p->input_buffer_frames += needed; +} + +void mp_scaletempo2_set_final(struct mp_scaletempo2 *p) +{ + if (p->input_buffer_final_frames <= 0) { + p->input_buffer_final_frames = p->input_buffer_frames; + } +} + +int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p, + uint8_t **planes, int frame_size, double playback_rate) +{ + int needed = frames_needed(p, playback_rate); + int read = MPMIN(needed, frame_size); + if (read == 0) + return 0; + + int required_size = read + p->input_buffer_frames; + if (required_size > p->input_buffer_size) + resize_input_buffer(p, required_size); + + for (int i = 0; i < p->channels; ++i) { + memcpy(p->input_buffer[i] + p->input_buffer_frames, + planes[i], read * sizeof(float)); + } + + p->input_buffer_frames += read; + return read; +} + +static bool target_is_within_search_region(struct mp_scaletempo2 *p) +{ + return p->target_block_index >= p->search_block_index + && p->target_block_index + p->ola_window_size + <= p->search_block_index + p->search_block_size; +} + + +static void peek_audio_with_zero_prepend(struct mp_scaletempo2 *p, + int read_offset_frames, float **dest, int dest_frames) +{ + assert(read_offset_frames + dest_frames <= p->input_buffer_frames); + + int write_offset = 0; + int num_frames_to_read = dest_frames; + if (read_offset_frames < 0) { + int num_zero_frames_appended = MPMIN( + -read_offset_frames, num_frames_to_read); + read_offset_frames = 0; + num_frames_to_read -= num_zero_frames_appended; + write_offset = num_zero_frames_appended; + zero_2d_partial(dest, p->channels, num_zero_frames_appended); + } + peek_buffer(p, num_frames_to_read, read_offset_frames, write_offset, dest); +} + +static void get_optimal_block(struct mp_scaletempo2 *p) +{ + int optimal_index = 0; + + // An interval around last optimal block which is excluded from the search. + // This is to reduce the buzzy sound. The number 160 is rather arbitrary and + // derived heuristically. + const int exclude_interval_length_frames = 160; + if (target_is_within_search_region(p)) { + optimal_index = p->target_block_index; + peek_audio_with_zero_prepend(p, + optimal_index, p->optimal_block, p->ola_window_size); + } else { + peek_audio_with_zero_prepend(p, + p->target_block_index, p->target_block, p->ola_window_size); + peek_audio_with_zero_prepend(p, + p->search_block_index, p->search_block, p->search_block_size); + int last_optimal = p->target_block_index + - p->ola_hop_size - p->search_block_index; + struct interval exclude_iterval = { + .lo = last_optimal - exclude_interval_length_frames / 2, + .hi = last_optimal + exclude_interval_length_frames / 2 + }; + + // |optimal_index| is in frames and it is relative to the beginning of the + // |search_block|. + optimal_index = compute_optimal_index( + p->search_block, p->search_block_size, + p->target_block, p->ola_window_size, + p->energy_candidate_blocks, + p->channels, + exclude_iterval); + + // Translate |index| w.r.t. the beginning of |audio_buffer| and extract the + // optimal block. + optimal_index += p->search_block_index; + peek_audio_with_zero_prepend(p, + optimal_index, p->optimal_block, p->ola_window_size); + + // Make a transition from target block to the optimal block if different. + // Target block has the best continuation to the current output. + // Optimal block is the most similar block to the target, however, it might + // introduce some discontinuity when over-lap-added. Therefore, we combine + // them for a smoother transition. The length of transition window is twice + // as that of the optimal-block which makes it like a weighting function + // where target-block has higher weight close to zero (weight of 1 at index + // 0) and lower weight close the end. + for (int k = 0; k < p->channels; ++k) { + float* ch_opt = p->optimal_block[k]; + float* ch_target = p->target_block[k]; + for (int n = 0; n < p->ola_window_size; ++n) { + ch_opt[n] = ch_opt[n] * p->transition_window[n] + + ch_target[n] * p->transition_window[p->ola_window_size + n]; + } + } + } + + // Next target is one hop ahead of the current optimal. + p->target_block_index = optimal_index + p->ola_hop_size; +} + +static void set_output_time(struct mp_scaletempo2 *p, double output_time) +{ + p->output_time = output_time; + p->search_block_index = get_search_block_index(p, output_time); +} + +static void remove_old_input_frames(struct mp_scaletempo2 *p) +{ + const int earliest_used_index = MPMIN( + p->target_block_index, p->search_block_index); + if (earliest_used_index <= 0) + return; // Nothing to remove. + + // Remove frames from input and adjust indices accordingly. + seek_buffer(p, earliest_used_index); + p->target_block_index -= earliest_used_index; + p->output_time -= earliest_used_index; + p->search_block_index -= earliest_used_index; +} + +static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate) +{ + if (!can_perform_wsola(p, playback_rate)) { + return false; + } + + set_output_time(p, get_updated_time(p, playback_rate)); + remove_old_input_frames(p); + + assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames); + + get_optimal_block(p); + + // Overlap-and-add. + for (int k = 0; k < p->channels; ++k) { + float* ch_opt_frame = p->optimal_block[k]; + float* ch_output = p->wsola_output[k] + p->num_complete_frames; + if (p->wsola_output_started) { + for (int n = 0; n < p->ola_hop_size; ++n) { + ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] + + ch_opt_frame[n] * p->ola_window[n]; + } + + // Copy the second half to the output. + memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size], + sizeof(*ch_opt_frame) * p->ola_hop_size); + } else { + // No overlap for the first iteration. + memcpy(ch_output, ch_opt_frame, + sizeof(*ch_opt_frame) * p->ola_window_size); + } + } + + p->num_complete_frames += p->ola_hop_size; + p->wsola_output_started = true; + return true; +} + +static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest) +{ + int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index); + + if (frames_to_copy <= 0) + return 0; // There is nothing to read from input buffer; return. + + peek_buffer(p, frames_to_copy, p->target_block_index, 0, dest); + seek_buffer(p, frames_to_copy); + return frames_to_copy; +} + +int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p, + float **dest, int dest_size, double playback_rate) +{ + if (playback_rate == 0) return 0; + + if (p->input_buffer_final_frames > 0) { + add_input_buffer_final_silence(p, playback_rate); + } + + // Optimize the muted case to issue a single clear instead of performing + // the full crossfade and clearing each crossfaded frame. + if (playback_rate < p->opts->min_playback_rate + || (playback_rate > p->opts->max_playback_rate + && p->opts->max_playback_rate > 0)) + { + int frames_to_render = MPMIN(dest_size, + (int) (p->input_buffer_frames / playback_rate)); + + // Compute accurate number of frames to actually skip in the source data. + // Includes the leftover partial frame from last request. However, we can + // only skip over complete frames, so a partial frame may remain for next + // time. + p->muted_partial_frame += frames_to_render * playback_rate; + int seek_frames = (int) (p->muted_partial_frame); + zero_2d_partial(dest, p->channels, frames_to_render); + seek_buffer(p, seek_frames); + + // Determine the partial frame that remains to be skipped for next call. If + // the user switches back to playing, it may be off time by this partial + // frame, which would be undetectable. If they subsequently switch to + // another playback rate that mutes, the code will attempt to line up the + // frames again. + p->muted_partial_frame -= seek_frames; + return frames_to_render; + } + + int slower_step = (int) ceilf(p->ola_window_size * playback_rate); + int faster_step = (int) ceilf(p->ola_window_size / playback_rate); + + // Optimize the most common |playback_rate| ~= 1 case to use a single copy + // instead of copying frame by frame. + if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) { + + if (p->wsola_output_started) { + p->wsola_output_started = false; + + // sync audio precisely again + set_output_time(p, p->target_block_index); + remove_old_input_frames(p); + } + + return read_input_buffer(p, dest_size, dest); + } + + int rendered_frames = 0; + do { + rendered_frames += write_completed_frames_to(p, + dest_size - rendered_frames, rendered_frames, dest); + } while (rendered_frames < dest_size + && run_one_wsola_iteration(p, playback_rate)); + return rendered_frames; +} + +double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate) +{ + return p->input_buffer_frames - p->output_time + - p->input_buffer_added_silence + + p->num_complete_frames * playback_rate; +} + +bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate) +{ + return p->input_buffer_final_frames > p->target_block_index + || can_perform_wsola(p, playback_rate) + || p->num_complete_frames > 0; +} + +void mp_scaletempo2_destroy(struct mp_scaletempo2 *p) +{ + free(p->ola_window); + free(p->transition_window); + free(p->wsola_output); + free(p->optimal_block); + free(p->search_block); + free(p->target_block); + free(p->input_buffer); + free(p->energy_candidate_blocks); +} + +void mp_scaletempo2_reset(struct mp_scaletempo2 *p) +{ + p->input_buffer_frames = 0; + p->input_buffer_final_frames = 0; + p->input_buffer_added_silence = 0; + p->output_time = 0.0; + p->search_block_index = 0; + p->target_block_index = 0; + // Clear the queue of decoded packets. + zero_2d(p->wsola_output, p->channels, p->wsola_output_size); + p->num_complete_frames = 0; + p->wsola_output_started = false; +} + +// Return a "periodic" Hann window. This is the first L samples of an L+1 +// Hann window. It is perfect reconstruction for overlap-and-add. +static void get_symmetric_hanning_window(int window_length, float* window) +{ + const float scale = 2.0f * M_PI / window_length; + for (int n = 0; n < window_length; ++n) + window[n] = 0.5f * (1.0f - cosf(n * scale)); +} + + +void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate) +{ + p->muted_partial_frame = 0; + p->output_time = 0; + p->search_block_index = 0; + p->target_block_index = 0; + p->num_complete_frames = 0; + p->wsola_output_started = false; + p->channels = channels; + + p->samples_per_second = rate; + p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms + * p->samples_per_second / 1000); + p->ola_window_size = (int)(p->opts->ola_window_size_ms + * p->samples_per_second / 1000); + // Make sure window size in an even number. + p->ola_window_size += p->ola_window_size & 1; + p->ola_hop_size = p->ola_window_size / 2; + // |num_candidate_blocks| / 2 is the offset of the center of the search + // block to the center of the first (left most) candidate block. The offset + // of the center of a candidate block to its left most point is + // |ola_window_size| / 2 - 1. Note that |ola_window_size| is even and in + // our convention the center belongs to the left half, so we need to subtract + // one frame to get the correct offset. + // + // Search Block + // <-------------------------------------------> + // + // |ola_window_size| / 2 - 1 + // <---- + // + // |num_candidate_blocks| / 2 + // <---------------- + // center + // X----X----------------X---------------X-----X + // <----------> <----------> + // Candidate ... Candidate + // 1, ... |num_candidate_blocks| + p->search_block_center_offset = p->num_candidate_blocks / 2 + + (p->ola_window_size / 2 - 1); + p->ola_window = realloc(p->ola_window, sizeof(float) * p->ola_window_size); + get_symmetric_hanning_window(p->ola_window_size, p->ola_window); + p->transition_window = realloc(p->transition_window, + sizeof(float) * p->ola_window_size * 2); + get_symmetric_hanning_window(2 * p->ola_window_size, p->transition_window); + + p->wsola_output_size = p->ola_window_size + p->ola_hop_size; + p->wsola_output = realloc_2d(p->wsola_output, p->channels, p->wsola_output_size); + // Initialize for overlap-and-add of the first block. + zero_2d(p->wsola_output, p->channels, p->wsola_output_size); + + // Auxiliary containers. + p->optimal_block = realloc_2d(p->optimal_block, p->channels, p->ola_window_size); + p->search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1); + p->search_block = realloc_2d(p->search_block, p->channels, p->search_block_size); + p->target_block = realloc_2d(p->target_block, p->channels, p->ola_window_size); + + resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size)); + p->input_buffer_frames = 0; + p->input_buffer_final_frames = 0; + p->input_buffer_added_silence = 0; + + p->energy_candidate_blocks = realloc(p->energy_candidate_blocks, + sizeof(float) * p->channels * p->num_candidate_blocks); +} diff --git a/audio/filter/af_scaletempo2_internals.h b/audio/filter/af_scaletempo2_internals.h new file mode 100644 index 0000000..6c3c94c --- /dev/null +++ b/audio/filter/af_scaletempo2_internals.h @@ -0,0 +1,134 @@ +// This filter was ported from Chromium +// (https://chromium.googlesource.com/chromium/chromium/+/51ed77e3f37a9a9b80d6d0a8259e84a8ca635259/media/filters/audio_renderer_algorithm.cc) +// +// Copyright 2015 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "common/common.h" + +struct mp_scaletempo2_opts { + // Max/min supported playback rates for fast/slow audio. Audio outside of these + // ranges are muted. + // Audio at these speeds would sound better under a frequency domain algorithm. + float min_playback_rate; + float max_playback_rate; + // Overlap-and-add window size in milliseconds. + float ola_window_size_ms; + // Size of search interval in milliseconds. The search interval is + // [-delta delta] around |output_index| * |playback_rate|. So the search + // interval is 2 * delta. + float wsola_search_interval_ms; +}; + +struct mp_scaletempo2 { + struct mp_scaletempo2_opts *opts; + // Number of channels in audio stream. + int channels; + // Sample rate of audio stream. + int samples_per_second; + // If muted, keep track of partial frames that should have been skipped over. + double muted_partial_frame; + // Book keeping of the current time of generated audio, in frames. + // Corresponds to the center of |search_block|. This is increased in + // intervals of |ola_hop_size| multiplied by the current playback_rate, + // for every WSOLA iteration. This tracks the number of advanced frames as + // a double to achieve accurate playback rates beyond the integer precision + // of |search_block_index|. + // Needs to be adjusted like any other index when frames are evicted from + // |input_buffer|. + double output_time; + // The offset of the center frame of |search_block| w.r.t. its first frame. + int search_block_center_offset; + // Index of the beginning of the |search_block|, in frames. This may be + // negative, which is handled by |peek_audio_with_zero_prepend|. + int search_block_index; + // Number of Blocks to search to find the most similar one to the target + // frame. + int num_candidate_blocks; + // Index of the beginning of the target block, counted in frames. + int target_block_index; + // Overlap-and-add window size in frames. + int ola_window_size; + // The hop size of overlap-and-add in frames. This implementation assumes 50% + // overlap-and-add. + int ola_hop_size; + // Number of frames in |wsola_output| that overlap-and-add is completed for + // them and can be copied to output if fill_buffer() is called. It also + // specifies the index where the next WSOLA window has to overlap-and-add. + int num_complete_frames; + // Whether |wsola_output| contains an additional |ola_hop_size| of overlap + // frames for the next iteration. + bool wsola_output_started; + // Overlap-and-add window. + float *ola_window; + // Transition window, used to update |optimal_block| by a weighted sum of + // |optimal_block| and |target_block|. + float *transition_window; + // This stores a part of the output that is created but couldn't be rendered. + // Output is generated frame-by-frame which at some point might exceed the + // number of requested samples. Furthermore, due to overlap-and-add, + // the last half-window of the output is incomplete, which is stored in this + // buffer. + float **wsola_output; + int wsola_output_size; + // Auxiliary variables to avoid allocation in every iteration. + // Stores the optimal block in every iteration. This is the most + // similar block to |target_block| within |search_block| and it is + // overlap-and-added to |wsola_output|. + float **optimal_block; + // A block of data that search is performed over to find the |optimal_block|. + float **search_block; + int search_block_size; + // Stores the target block, denoted as |target| above. |search_block| is + // searched for a block (|optimal_block|) that is most similar to + // |target_block|. + float **target_block; + // Buffered audio data. + float **input_buffer; + int input_buffer_size; + int input_buffer_frames; + // How many frames in |input_buffer| need to be flushed by padding with + // silence to process the final packet. While this is nonzero, the filter + // appends silence to |input_buffer| until these frames are processed. + int input_buffer_final_frames; + // How many additional frames of silence have been added to |input_buffer| + // for padding after the final packet. + int input_buffer_added_silence; + float *energy_candidate_blocks; +}; + +void mp_scaletempo2_destroy(struct mp_scaletempo2 *p); +void mp_scaletempo2_reset(struct mp_scaletempo2 *p); +void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate); +double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate); +int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p, + uint8_t **planes, int frame_size, double playback_rate); +void mp_scaletempo2_set_final(struct mp_scaletempo2 *p); +int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p, + float **dest, int dest_size, double playback_rate); +bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate); |