8 files changed, 2963 insertions, 0 deletions
diff --git a/audio/filter/af_drop.c b/audio/filter/af_drop.c
new file mode 100644
index 0000000..724c482
--- /dev/null
+++ b/audio/filter/af_drop.c
@@ -0,0 +1,114 @@
+#include "audio/aframe.h"
+#include "audio/format.h"
+#include "common/common.h"
+#include "filters/f_autoconvert.h"
+#include "filters/filter_internal.h"
+#include "filters/user_filters.h"
+
+// Per-instance state of the "drop" filter.
+struct priv {
+    // Speed factor applied to output frames; 1.0 on creation, updated by
+    // MP_FILTER_COMMAND_SET_SPEED.
+    double speed;
+    // Running duration balance: process() subtracts the duration of every
+    // input frame it reads and adds the (speed-adjusted) duration of every
+    // frame it outputs. Clearly positive means drop, clearly negative repeat.
+    double diff;
+    // Reference to the most recently output frame, kept for repeating.
+    struct mp_aframe *last;
+};
+
+// Filter callback: emit at most one frame on the output pin. Depending on the
+// accumulated duration balance (p->diff), the frame is either a repeat of the
+// previously output frame, a freshly read input frame, or nothing at all when
+// the input frame gets dropped.
+static void process(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+    struct mp_pin *src = f->ppins[0];
+    struct mp_pin *dst = f->ppins[1];
+
+    if (!mp_pin_in_needs_data(dst))
+        return;
+
+    struct mp_frame frame = {0};
+
+    double prev_dur = 0;
+    if (p->last)
+        prev_dur = mp_aframe_duration(p->last);
+
+    // Repeat only if we are behind by more than half of the previous frame.
+    bool repeat = p->last && p->diff < 0 && -p->diff > prev_dur / 2;
+
+    if (repeat) {
+        MP_VERBOSE(f, "repeat\n");
+        // Ownership of the stored reference moves into the output frame.
+        frame = MAKE_FRAME(MP_FRAME_AUDIO, p->last);
+        p->last = NULL;
+    } else {
+        frame = mp_pin_out_read(src);
+
+        if (frame.type == MP_FRAME_AUDIO) {
+            double cur_dur = mp_aframe_duration(frame.data);
+            p->diff -= cur_dur;
+            // Drop if we are ahead by more than half of this frame.
+            if (p->diff > cur_dur / 2) {
+                MP_VERBOSE(f, "drop\n");
+                mp_frame_unref(&frame);
+                mp_filter_internal_mark_progress(f);
+            }
+        }
+    }
+
+    switch (frame.type) {
+    case MP_FRAME_AUDIO: {
+        struct mp_aframe *cur = frame.data;
+        // Remember this frame (before the speed change) for a later repeat.
+        talloc_free(p->last);
+        p->last = mp_aframe_new_ref(cur);
+        mp_aframe_mul_speed(cur, p->speed);
+        p->diff += mp_aframe_duration(cur);
+        // A repeated frame would start where this one ends.
+        mp_aframe_set_pts(p->last, mp_aframe_end_pts(cur));
+        break;
+    }
+    case MP_FRAME_EOF:
+        TA_FREEP(&p->last);
+        break;
+    default:
+        break;
+    }
+
+    mp_pin_in_write(dst, frame);
+}
+
+// Command callback: only MP_FILTER_COMMAND_SET_SPEED is handled, which stores
+// the new speed factor. Returns true if the command was handled.
+static bool command(struct mp_filter *f, struct mp_filter_command *cmd)
+{
+    if (cmd->type != MP_FILTER_COMMAND_SET_SPEED)
+        return false;
+
+    struct priv *state = f->priv;
+    state->speed = cmd->speed;
+    return true;
+}
+
+// Reset callback: clear the duration balance and drop the frame kept for
+// repeating.
+static void reset(struct mp_filter *f)
+{
+    struct priv *state = f->priv;
+
+    state->diff = 0;
+    TA_FREEP(&state->last);
+}
+
+// Destroy callback: reuses reset() to release the stored frame reference.
+static void destroy(struct mp_filter *f)
+{
+    reset(f);
+}
+
+// Filter descriptor handed to mp_filter_create() by af_drop_create(); it
+// registers the callbacks defined above.
+static const struct mp_filter_info af_drop_filter = {
+    .name = "drop",
+    .priv_size = sizeof(struct priv),
+    .process = process,
+    .command = command,
+    .reset = reset,
+    .destroy = destroy,
+};
+
+// Create a "drop" filter instance as a child of parent, with one input and
+// one output pin and an initial speed of 1.0. Returns NULL on failure.
+//
+// options is owned by this function. This filter reads nothing from it, so it
+// is released on every path instead of only on failure.
+// NOTE(review): assumes the create callback owns options on success as well,
+// as the talloc_steal() calls in the sibling filters suggest - confirm against
+// the user filter API.
+static struct mp_filter *af_drop_create(struct mp_filter *parent, void *options)
+{
+    talloc_free(options);
+
+    struct mp_filter *f = mp_filter_create(parent, &af_drop_filter);
+    if (!f)
+        return NULL;
+
+    mp_filter_add_pin(f, MP_PIN_IN, "in");
+    mp_filter_add_pin(f, MP_PIN_OUT, "out");
+
+    struct priv *p = f->priv;
+    p->speed = 1.0;
+
+    return f;
+}
+
+// User-filter registration entry for the "drop" filter.
+// NOTE(review): .priv_size is sizeof(struct priv) although no option table is
+// declared and af_drop_create() never reads the options pointer - confirm
+// that allocating an options struct of this size is intended.
+const struct mp_user_filter_entry af_drop = {
+    .desc = {
+        .description = "Change audio speed by dropping/repeating frames",
+        .name = "drop",
+        .priv_size = sizeof(struct priv),
+    },
+    .create = af_drop_create,
+};
diff --git a/audio/filter/af_format.c b/audio/filter/af_format.c
new file mode 100644
index 0000000..2d1c1cc
--- /dev/null
+++ b/audio/filter/af_format.c
@@ -0,0 +1,143 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "audio/aframe.h"
+#include "audio/format.h"
+#include "filters/f_autoconvert.h"
+#include "filters/filter_internal.h"
+#include "filters/user_filters.h"
+#include "options/m_option.h"
+
+// User options of the "format" filter (see the option table of af_format).
+struct f_opts {
+    // "format": if non-zero, added as accepted sample format to the
+    // autoconvert sub-filter.
+    int in_format;
+    // "srate": if non-zero, added as accepted sample rate.
+    int in_srate;
+    // "channels": if non-empty, the first entry is added as accepted layout.
+    struct m_channels in_channels;
+    // NOTE(review): not mapped by the option table and not read anywhere in
+    // this file - confirm whether it is still needed.
+    int out_format;
+    // "out-srate": if non-zero, process() overwrites the frame's rate field.
+    int out_srate;
+    // "out-channels": if non-empty, process() forces the first entry as the
+    // frame's channel map.
+    struct m_channels out_channels;
+
+    // "fail": makes process() fail on purpose.
+    bool fail;
+};
+
+// Per-instance state of the "format" filter.
+struct priv {
+    // Options struct, reparented to this struct in af_format_create().
+    struct f_opts *opts;
+    // Output pin of the autoconvert sub-filter; process() reads from it.
+    struct mp_pin *in_pin;
+};
+
+// Filter callback: move one frame from the autoconvert output to our output
+// pin. EOF is forwarded unchanged; audio frames get the forced output channel
+// map and sample rate applied first. Any other frame type, a failed channel
+// map override, or the "fail" option marks the filter as failed.
+static void process(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+    struct f_opts *opts = p->opts;
+    struct mp_pin *dst = f->ppins[1];
+
+    if (!mp_pin_can_transfer_data(dst, p->in_pin))
+        return;
+
+    struct mp_frame frame = mp_pin_out_read(p->in_pin);
+    bool ok = false;
+
+    if (opts->fail) {
+        MP_ERR(f, "Failing on purpose.\n");
+    } else if (frame.type == MP_FRAME_EOF) {
+        ok = true;
+    } else if (frame.type != MP_FRAME_AUDIO) {
+        MP_ERR(f, "audio frame expected\n");
+    } else {
+        struct mp_aframe *audio = frame.data;
+        ok = true;
+
+        if (opts->out_channels.num_chmaps > 0 &&
+            !mp_aframe_set_chmap(audio, &opts->out_channels.chmaps[0]))
+        {
+            MP_ERR(f, "could not force output channels\n");
+            ok = false;
+        }
+
+        if (ok && opts->out_srate)
+            mp_aframe_set_rate(audio, opts->out_srate);
+    }
+
+    if (ok) {
+        mp_pin_in_write(dst, frame);
+    } else {
+        mp_frame_unref(&frame);
+        mp_filter_internal_mark_failed(f);
+    }
+}
+
+// Filter descriptor handed to mp_filter_create() by af_format_create().
+static const struct mp_filter_info af_format_filter = {
+    .name = "format",
+    .priv_size = sizeof(struct priv),
+    .process = process,
+};
+
+// Create a "format" filter instance. The options struct is freed on failure
+// and otherwise reparented to the filter's private data. An autoconvert
+// sub-filter, configured from the input-side options, is inserted between the
+// filter's input pin and process().
+static struct mp_filter *af_format_create(struct mp_filter *parent,
+                                              void *options)
+{
+    struct mp_filter *filter = mp_filter_create(parent, &af_format_filter);
+    if (!filter) {
+        talloc_free(options);
+        return NULL;
+    }
+
+    struct priv *p = filter->priv;
+    struct f_opts *opts = talloc_steal(p, options);
+    p->opts = opts;
+
+    mp_filter_add_pin(filter, MP_PIN_IN, "in");
+    mp_filter_add_pin(filter, MP_PIN_OUT, "out");
+
+    struct mp_autoconvert *conv = mp_autoconvert_create(filter);
+    if (!conv)
+        abort();
+
+    // Only restrict what the user asked for; unset options stay unconstrained.
+    if (opts->in_format)
+        mp_autoconvert_add_afmt(conv, opts->in_format);
+    if (opts->in_srate)
+        mp_autoconvert_add_srate(conv, opts->in_srate);
+    if (opts->in_channels.num_chmaps > 0)
+        mp_autoconvert_add_chmap(conv, &opts->in_channels.chmaps[0]);
+
+    struct mp_filter *conv_filter = conv->f;
+    mp_pin_connect(conv_filter->pins[0], filter->ppins[0]);
+    p->in_pin = conv_filter->pins[1];
+
+    return filter;
+}
+
+#define OPT_BASE_STRUCT struct f_opts
+
+// User-filter registration entry for the "format" filter. The option table
+// maps option names onto struct f_opts; the "out-*" options and "fail" are
+// consumed by process(), the others by af_format_create().
+const struct mp_user_filter_entry af_format = {
+    .desc = {
+        .name = "format",
+        .description = "Force audio format",
+        .priv_size = sizeof(struct f_opts),
+        .options = (const struct m_option[]) {
+            {"format", OPT_AUDIOFORMAT(in_format)},
+            {"srate", OPT_INT(in_srate), M_RANGE(1000, 8*48000)},
+            {"channels", OPT_CHANNELS(in_channels),
+                .flags = M_OPT_CHANNELS_LIMITED},
+            {"out-srate", OPT_INT(out_srate), M_RANGE(1000, 8*48000)},
+            {"out-channels", OPT_CHANNELS(out_channels),
+                .flags = M_OPT_CHANNELS_LIMITED},
+            {"fail", OPT_BOOL(fail)},
+            {0}
+        },
+    },
+    .create = af_format_create,
+};
diff --git a/audio/filter/af_lavcac3enc.c b/audio/filter/af_lavcac3enc.c
new file mode 100644
index 0000000..b4a1d59
--- /dev/null
+++ b/audio/filter/af_lavcac3enc.c
@@ -0,0 +1,437 @@
+/*
+ * audio filter for runtime AC-3 encoding with libavcodec.
+ *
+ * Copyright (C) 2007 Ulion <ulion A gmail P com>
+ *
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include <libavcodec/avcodec.h>
+#include <libavutil/intreadwrite.h>
+#include <libavutil/common.h>
+#include <libavutil/bswap.h>
+#include <libavutil/mem.h>
+
+#include "config.h"
+
+#include "audio/aframe.h"
+#include "audio/chmap_avchannel.h"
+#include "audio/chmap_sel.h"
+#include "audio/fmt-conversion.h"
+#include "audio/format.h"
+#include "common/av_common.h"
+#include "common/common.h"
+#include "filters/f_autoconvert.h"
+#include "filters/f_utils.h"
+#include "filters/filter_internal.h"
+#include "filters/user_filters.h"
+#include "options/m_option.h"
+
+
+#define AC3_MAX_CHANNELS 6
+#define AC3_MAX_CODED_FRAME_SIZE 3840
+#define AC3_FRAME_SIZE (6  * 256)
+// Bitrates (in kbit/s) accepted for the "bitrate" option; the create function
+// multiplies the matching entry by 1000 before passing it to the encoder.
+static const uint16_t ac3_bitrate_tab[19] = {
+    32, 40, 48, 56, 64, 80, 96, 112, 128,
+    160, 192, 224, 256, 320, 384, 448, 512, 576, 640
+};
+
+// User options of the "lavcac3enc" filter (see the option table of
+// af_lavcac3enc).
+struct f_opts {
+    // "tospdif": prepend an IEC 61937 header and pad each packet to a fixed
+    // frame size.
+    bool add_iec61937_header;
+    // "bitrate": a value from ac3_bitrate_tab, or 0 for auto/default.
+    int bit_rate;
+    // "minch": frames with fewer channels than this are passed through
+    // unencoded.
+    int min_channel_num;
+    // "encoder": name passed to avcodec_find_encoder_by_name().
+    char *encoder;
+    // "o": key/value list applied to the codec context via mp_set_avopts().
+    char **avopts;
+};
+
+// Per-instance state of the "lavcac3enc" filter.
+struct priv {
+    struct f_opts *opts;
+
+    // Output pin of the fixed-frame-size sub-filter; process() reads from it.
+    struct mp_pin *in_pin;
+    // Audio configuration the encoder is currently opened for; reset at the
+    // start of reinit() and filled in once the encoder opened successfully.
+    struct mp_aframe *cur_format;
+    // Most recent input frame; its attributes are copied to the output frame.
+    struct mp_aframe *in_frame;
+    struct mp_aframe_pool *out_pool;
+
+    const struct AVCodec  *lavc_acodec;
+    struct AVCodecContext *lavc_actx;
+    AVPacket              *lavc_pkt;
+    // Requested bitrate (table value * 1000), or 0 to let reinit() pick a
+    // default based on the channel count.
+    int bit_rate;
+    int out_samples;    // upper bound on encoded output per AC3 frame
+};
+
+// (Re)open the encoder for the audio configuration of s->in_frame.
+//
+// s->cur_format is reset first and only filled in again on success, so a
+// failed reinit is retried on the next input frame. Returns false if the
+// encoder could not be opened or did not report an input frame size.
+static bool reinit(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    mp_aframe_reset(s->cur_format);
+
+    // Default bitrate indexed by channel count.
+    static const int default_bit_rate[AC3_MAX_CHANNELS+1] =
+        {0, 96000, 192000, 256000, 384000, 448000, 448000};
+
+    // Size of the output frames allocated by process(), in output samples.
+    if (s->opts->add_iec61937_header) {
+        s->out_samples = AC3_FRAME_SIZE;
+    } else {
+        s->out_samples = AC3_MAX_CODED_FRAME_SIZE /
+                         mp_aframe_get_sstride(s->in_frame);
+    }
+
+    int format = mp_aframe_get_format(s->in_frame);
+    int rate = mp_aframe_get_rate(s->in_frame);
+    struct mp_chmap chmap = {0};
+    mp_aframe_get_chmap(s->in_frame, &chmap);
+
+    // Fall back to the per-channel-count default if the user set no bitrate.
+    int bit_rate = s->bit_rate;
+    if (!bit_rate && chmap.num < AC3_MAX_CHANNELS + 1)
+        bit_rate = default_bit_rate[chmap.num];
+
+    avcodec_close(s->lavc_actx);
+
+    // Put sample parameters
+    s->lavc_actx->sample_fmt = af_to_avformat(format);
+
+#if !HAVE_AV_CHANNEL_LAYOUT
+    s->lavc_actx->channels = chmap.num;
+    s->lavc_actx->channel_layout = mp_chmap_to_lavc(&chmap);
+#else
+    mp_chmap_to_av_layout(&s->lavc_actx->ch_layout, &chmap);
+#endif
+    s->lavc_actx->sample_rate = rate;
+    s->lavc_actx->bit_rate = bit_rate;
+
+    if (avcodec_open2(s->lavc_actx, s->lavc_acodec, NULL) < 0) {
+        // Report the configured encoder name; it is not necessarily "ac3".
+        MP_ERR(f, "Couldn't open codec %s, br=%d.\n", s->opts->encoder,
+               bit_rate);
+        return false;
+    }
+
+    if (s->lavc_actx->frame_size < 1) {
+        MP_ERR(f, "encoder didn't specify input frame size\n");
+        return false;
+    }
+
+    mp_aframe_config_copy(s->cur_format, s->in_frame);
+    return true;
+}
+
+// Reset callback: drop the retained input frame. The encoder context itself
+// is left untouched here.
+static void reset(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    TA_FREEP(&s->in_frame);
+}
+
+// Destroy callback: release the retained input frame (the same cleanup that
+// reset() performs), then the packet and the codec context.
+static void destroy(struct mp_filter *f)
+{
+    struct priv *state = f->priv;
+
+    TA_FREEP(&state->in_frame);
+    av_packet_free(&state->lavc_pkt);
+    avcodec_free_context(&state->lavc_actx);
+}
+
+// Byte-swap size 16-bit words in place, starting at ptr.
+static void swap_16(uint16_t *ptr, size_t size)
+{
+    uint16_t *end = ptr + size;
+    for (uint16_t *cur = ptr; cur != end; cur++)
+        *cur = av_bswap16(*cur);
+}
+
+// Filter callback: feed input frames to the encoder until it returns a
+// packet, then wrap that packet (optionally with an IEC 61937 header and
+// padding) into an AF_FORMAT_S_AC3 frame and write it to the output pin.
+//
+// Frames with fewer than opts->min_channel_num channels are passed through
+// unencoded. Any failure marks the filter as failed.
+static void process(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    if (!mp_pin_in_needs_data(f->ppins[1]))
+        return;
+
+    bool err = true;
+    struct mp_aframe *out = NULL;
+    AVPacket *pkt = s->lavc_pkt;
+    // Declared outside the loop so that every exit path (pass-through, failed
+    // reinit) releases it in the common cleanup below.
+    AVFrame *frame = NULL;
+
+    // Send input as long as it wants.
+    while (1) {
+        if (avcodec_is_open(s->lavc_actx)) {
+            int lavc_ret = avcodec_receive_packet(s->lavc_actx, pkt);
+            if (lavc_ret >= 0)
+                break;
+            if (lavc_ret < 0 && lavc_ret != AVERROR(EAGAIN)) {
+                MP_FATAL(f, "Encode failed (receive).\n");
+                goto error;
+            }
+        }
+        struct mp_frame input = mp_pin_out_read(s->in_pin);
+        // The following code assumes no sample data buffering in the encoder.
+        switch (input.type) {
+        case MP_FRAME_NONE:
+            goto done; // no data yet
+        case MP_FRAME_EOF:
+            mp_pin_in_write(f->ppins[1], input);
+            goto done;
+        case MP_FRAME_AUDIO:
+            TA_FREEP(&s->in_frame);
+            s->in_frame = input.data;
+            frame = mp_frame_to_av(input, NULL);
+            if (!frame)
+                goto error;
+            if (mp_aframe_get_channels(s->in_frame) < s->opts->min_channel_num) {
+                // Just pass it through. Ownership of the frame moves to the
+                // output pin, so forget our pointer to it.
+                s->in_frame = NULL;
+                mp_pin_in_write(f->ppins[1], input);
+                goto done;
+            }
+            if (!mp_aframe_config_equals(s->in_frame, s->cur_format)) {
+                if (!reinit(f))
+                    goto error;
+            }
+            break;
+        default:
+            // unexpected packet type; nobody else will release it
+            mp_frame_unref(&input);
+            goto error;
+        }
+        int lavc_ret = avcodec_send_frame(s->lavc_actx, frame);
+        av_frame_free(&frame);
+        if (lavc_ret < 0 && lavc_ret != AVERROR(EAGAIN)) {
+            MP_FATAL(f, "Encode failed (send).\n");
+            goto error;
+        }
+    }
+
+    // A packet without a source frame to take attributes from is an error.
+    if (!s->in_frame)
+        goto error;
+
+    out = mp_aframe_create();
+    mp_aframe_set_format(out, AF_FORMAT_S_AC3);
+    mp_aframe_set_chmap(out, &(struct mp_chmap)MP_CHMAP_INIT_STEREO);
+    mp_aframe_set_rate(out, 48000);
+
+    if (mp_aframe_pool_allocate(s->out_pool, out, s->out_samples) < 0)
+        goto error;
+
+    int sstride = mp_aframe_get_sstride(out);
+
+    mp_aframe_copy_attributes(out, s->in_frame);
+
+    int frame_size = pkt->size;
+    int header_len = 0;
+    char hdr[8] = {0};
+
+    if (s->opts->add_iec61937_header && pkt->size > 5) {
+        int bsmod = pkt->data[5] & 0x7;
+        int len = frame_size;
+
+        frame_size = AC3_FRAME_SIZE * 2 * 2;
+        header_len = 8;
+
+        AV_WL16(hdr,     0xF872);   // iec 61937 syncword 1
+        AV_WL16(hdr + 2, 0x4E1F);   // iec 61937 syncword 2
+        hdr[5] = bsmod;             // bsmod
+        hdr[4] = 0x01;              // data-type ac3
+        AV_WL16(hdr + 6, len << 3); // number of bits in payload
+    }
+
+    if (frame_size > s->out_samples * sstride)
+        abort();
+
+    // The copies below write header_len + pkt->size bytes and pad the rest up
+    // to frame_size; refuse packets that would not fit instead of writing
+    // past the frame (and passing a negative length to memset).
+    if (header_len + pkt->size > frame_size) {
+        MP_ERR(f, "Encoded packet too large for output frame.\n");
+        goto error;
+    }
+
+    uint8_t **planes = mp_aframe_get_data_rw(out);
+    if (!planes)
+        goto error;
+    char *buf = planes[0];
+    memcpy(buf, hdr, header_len);
+    memcpy(buf + header_len, pkt->data, pkt->size);
+    memset(buf + header_len + pkt->size, 0,
+           frame_size - (header_len + pkt->size));
+    // Only the payload is byte-swapped; the header is left as written above.
+    swap_16((uint16_t *)(buf + header_len), pkt->size / 2);
+    mp_aframe_set_size(out, frame_size / sstride);
+    mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out));
+    out = NULL;
+
+done:
+    err = false;
+    // fall through
+error:
+    av_frame_free(&frame);
+    av_packet_unref(pkt);
+    talloc_free(out);
+    if (err)
+        mp_filter_internal_mark_failed(f);
+}
+
+// Filter descriptor handed to mp_filter_create() by af_lavcac3enc_create().
+static const struct mp_filter_info af_lavcac3enc_filter = {
+    .name = "lavcac3enc",
+    .priv_size = sizeof(struct priv),
+    .process = process,
+    .reset = reset,
+    .destroy = destroy,
+};
+
+// Register every channel layout advertised by codec with the autoconvert
+// filter. Layouts that cannot be converted to a valid mp_chmap are skipped
+// (with a verbose log message in the AVChannelLayout code path).
+static void add_chmaps_to_autoconv(struct mp_filter *f,
+                                   struct mp_autoconvert *conv,
+                                   const struct AVCodec *codec)
+{
+#if !HAVE_AV_CHANNEL_LAYOUT
+    const uint64_t *layouts = codec->channel_layouts;
+    if (!layouts)
+        return;
+
+    // The list is terminated by a zero entry.
+    for (const uint64_t *cur = layouts; *cur; cur++) {
+        struct mp_chmap map = {0};
+        mp_chmap_from_lavc(&map, *cur);
+        if (mp_chmap_is_valid(&map))
+            mp_autoconvert_add_chmap(conv, &map);
+    }
+#else
+    const AVChannelLayout *layouts = codec->ch_layouts;
+    if (!layouts)
+        return;
+
+    // The list is terminated by an entry with zero channels.
+    for (const AVChannelLayout *cur = layouts; cur->nb_channels; cur++) {
+        struct mp_chmap map = {0};
+
+        if (mp_chmap_from_av_layout(&map, cur)) {
+            if (mp_chmap_is_valid(&map))
+                mp_autoconvert_add_chmap(conv, &map);
+            continue;
+        }
+
+        char desc[128] = {0};
+        int res = av_channel_layout_describe(cur, desc, 128);
+        MP_VERBOSE(f, "Skipping unsupported channel layout: %s\n",
+                   res < 0 ? "undefined" : desc);
+    }
+#endif
+}
+
+// Create a "lavcac3enc" filter instance.
+//
+// The options struct is freed on failure and otherwise reparented to the
+// filter's private data. Sets up the encoder context and packet, validates
+// the requested bitrate against ac3_bitrate_tab, and chains
+// autoconvert -> fixed-frame-size sub-filters in front of process().
+// Returns NULL if the encoder cannot be found or set up.
+static struct mp_filter *af_lavcac3enc_create(struct mp_filter *parent,
+                                              void *options)
+{
+    struct mp_filter *f = mp_filter_create(parent, &af_lavcac3enc_filter);
+    if (!f) {
+        talloc_free(options);
+        return NULL;
+    }
+
+    mp_filter_add_pin(f, MP_PIN_IN, "in");
+    mp_filter_add_pin(f, MP_PIN_OUT, "out");
+
+    struct priv *s = f->priv;
+    s->opts = talloc_steal(s, options);
+    s->cur_format = talloc_steal(s, mp_aframe_create());
+    s->out_pool = mp_aframe_pool_create(s);
+
+    s->lavc_acodec = avcodec_find_encoder_by_name(s->opts->encoder);
+    if (!s->lavc_acodec) {
+        MP_ERR(f, "Couldn't find encoder %s.\n", s->opts->encoder);
+        goto error;
+    }
+
+    s->lavc_actx = avcodec_alloc_context3(s->lavc_acodec);
+    if (!s->lavc_actx) {
+        MP_ERR(f, "Audio LAVC, couldn't allocate context!\n");
+        goto error;
+    }
+
+    s->lavc_pkt = av_packet_alloc();
+    if (!s->lavc_pkt)
+        goto error;
+
+    if (mp_set_avopts(f->log, s->lavc_actx, s->opts->avopts) < 0)
+        goto error;
+
+    // For this one, we require the encoder to export lists of all supported
+    // parameters. (Not all encoders do that, but the ones we're interested
+    // in do.)
+    if (!s->lavc_acodec->sample_fmts ||
+#if !HAVE_AV_CHANNEL_LAYOUT
+        !s->lavc_acodec->channel_layouts
+#else
+        !s->lavc_acodec->ch_layouts
+#endif
+        )
+    {
+        MP_ERR(f, "Audio encoder doesn't list supported parameters.\n");
+        goto error;
+    }
+
+    if (s->opts->bit_rate) {
+        // Derive the entry count from the table so both stay in sync.
+        const int num_rates =
+            sizeof(ac3_bitrate_tab) / sizeof(ac3_bitrate_tab[0]);
+        int i;
+        for (i = 0; i < num_rates; i++) {
+            if (ac3_bitrate_tab[i] == s->opts->bit_rate) {
+                s->bit_rate = ac3_bitrate_tab[i] * 1000;
+                break;
+            }
+        }
+        // No match: s->bit_rate stays 0, so reinit() picks a default.
+        if (i >= num_rates) {
+            MP_WARN(f, "unable set unsupported bitrate %d, using default "
+                    "bitrate (check manpage to see supported bitrates).\n",
+                    s->opts->bit_rate);
+        }
+    }
+
+    struct mp_autoconvert *conv = mp_autoconvert_create(f);
+    if (!conv)
+        abort();
+
+    const enum AVSampleFormat *lf = s->lavc_acodec->sample_fmts;
+    for (int i = 0; lf && lf[i] != AV_SAMPLE_FMT_NONE; i++) {
+        int mpfmt = af_from_avformat(lf[i]);
+        if (mpfmt)
+            mp_autoconvert_add_afmt(conv, mpfmt);
+    }
+
+    add_chmaps_to_autoconv(f, conv, s->lavc_acodec);
+
+    // At least currently, the AC3 encoder doesn't export sample rates.
+    mp_autoconvert_add_srate(conv, 48000);
+
+    mp_pin_connect(conv->f->pins[0], f->ppins[0]);
+
+    // Regroup the converted audio into frames of AC3_FRAME_SIZE samples.
+    struct mp_filter *fs = mp_fixed_aframe_size_create(f, AC3_FRAME_SIZE, true);
+    if (!fs)
+        abort();
+
+    mp_pin_connect(fs->pins[0], conv->f->pins[1]);
+    s->in_pin = fs->pins[1];
+
+    return f;
+
+error:
+    // Both free functions reset the pointers to NULL before f is freed.
+    av_packet_free(&s->lavc_pkt);
+    avcodec_free_context(&s->lavc_actx);
+    talloc_free(f);
+    return NULL;
+}
+
+#define OPT_BASE_STRUCT struct f_opts
+
+// User-filter registration entry for the "lavcac3enc" filter: option
+// defaults plus the table mapping option names onto struct f_opts.
+const struct mp_user_filter_entry af_lavcac3enc = {
+    .desc = {
+        .description = "runtime encode to ac3 using libavcodec",
+        .name = "lavcac3enc",
+        .priv_size = sizeof(OPT_BASE_STRUCT),
+        .priv_defaults = &(const OPT_BASE_STRUCT) {
+            .add_iec61937_header = true,
+            .bit_rate = 640,
+            .min_channel_num = 3,
+            .encoder = "ac3",
+        },
+        .options = (const struct m_option[]) {
+            {"tospdif", OPT_BOOL(add_iec61937_header)},
+            // "auto" and "default" both map to 0; the create function only
+            // accepts non-zero values that appear in ac3_bitrate_tab.
+            {"bitrate", OPT_CHOICE(bit_rate,
+                {"auto", 0}, {"default", 0}), M_RANGE(32, 640)},
+            {"minch", OPT_INT(min_channel_num), M_RANGE(2, 6)},
+            {"encoder", OPT_STRING(encoder)},
+            {"o", OPT_KEYVALUELIST(avopts)},
+            {0}
+        },
+    },
+    .create = af_lavcac3enc_create,
+};
diff --git a/audio/filter/af_rubberband.c b/audio/filter/af_rubberband.c
new file mode 100644
index 0000000..48e5cc1
--- /dev/null
+++ b/audio/filter/af_rubberband.c
@@ -0,0 +1,382 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#include <rubberband/rubberband-c.h>
+
+#include "config.h"
+
+#include "audio/aframe.h"
+#include "audio/format.h"
+#include "common/common.h"
+#include "filters/f_autoconvert.h"
+#include "filters/filter_internal.h"
+#include "filters/user_filters.h"
+#include "options/m_option.h"
+
+// command line options
+// command line options
+// The int fields hold RubberBandOption* values; init_rubberband() ORs them
+// together into the option mask passed to rubberband_new().
+struct f_opts {
+    int transients, detector, phase, window,
+        smoothing, formant, pitch, channels, engine;
+    // "pitch-scale": initial pitch scale factor.
+    double scale;
+};
+
+// Per-instance state of the "rubberband" filter.
+struct priv {
+    struct f_opts *opts;
+
+    // Output pin of the autoconvert sub-filter; process() reads from it.
+    struct mp_pin *in_pin;
+    // Audio configuration the current RubberBandState was created for.
+    struct mp_aframe *cur_format;
+    struct mp_aframe_pool *out_pool;
+    // Set once rubberband_process() was called with the final flag; no further
+    // input is passed to it until the state is reset or recreated.
+    bool sent_final;
+    // NULL until the first frame arrives (and between format changes).
+    RubberBandState rubber;
+    double speed;
+    double pitch;
+    // Current input frame; consumed samples are skipped from its start.
+    struct mp_aframe *pending;
+    // Estimate how much librubberband has buffered internally.
+    // I could not find a way to do this with the librubberband API.
+    double rubber_delay;
+};
+
+// Store the new playback speed and, if librubberband is already initialized,
+// apply it as time ratio (the reciprocal of the speed).
+static void update_speed(struct priv *p, double new_speed)
+{
+    p->speed = new_speed;
+    if (!p->rubber)
+        return;
+
+    double time_ratio = 1.0 / p->speed;
+    rubberband_set_time_ratio(p->rubber, time_ratio);
+}
+
+// Store a new pitch scale and, if librubberband is already initialized, apply
+// it. Returns false (leaving the state untouched) unless new_pitch lies within
+// [0.01, 100.0].
+static bool update_pitch(struct priv *p, double new_pitch)
+{
+    // Written as a positive range test so that NaN, for which every
+    // comparison is false, is rejected as well.
+    if (!(new_pitch >= 0.01 && new_pitch <= 100.0))
+        return false;
+
+    p->pitch = new_pitch;
+    if (p->rubber)
+        rubberband_set_pitch_scale(p->rubber, p->pitch);
+    return true;
+}
+
+// Create the RubberBandState for the format of the pending frame and apply
+// the current speed and pitch. Requires that no state exists yet and that a
+// pending frame is set. Returns false if the frame is not AF_FORMAT_FLOATP or
+// if librubberband fails to initialize.
+static bool init_rubberband(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+    struct f_opts *o = p->opts;
+
+    assert(!p->rubber);
+    assert(p->pending);
+
+    // Always run in realtime mode, plus whatever the user selected.
+    int flags = RubberBandOptionProcessRealTime;
+    flags |= o->transients;
+    flags |= o->detector;
+    flags |= o->phase;
+    flags |= o->window;
+    flags |= o->smoothing;
+    flags |= o->formant;
+    flags |= o->pitch;
+    flags |= o->channels;
+#if HAVE_RUBBERBAND_3
+    flags |= o->engine;
+#endif
+
+    int sample_rate = mp_aframe_get_rate(p->pending);
+    int num_channels = mp_aframe_get_channels(p->pending);
+    if (mp_aframe_get_format(p->pending) != AF_FORMAT_FLOATP)
+        return false;
+
+    p->rubber = rubberband_new(sample_rate, num_channels, flags, 1.0, 1.0);
+    if (!p->rubber) {
+        MP_FATAL(f, "librubberband initialization failed.\n");
+        return false;
+    }
+
+    mp_aframe_config_copy(p->cur_format, p->pending);
+
+    // The state was created with neutral ratios; apply the real ones now.
+    update_speed(p, p->speed);
+    update_pitch(p, p->pitch);
+
+    return true;
+}
+
+// Filter callback: feed input samples to librubberband until it has output
+// available, then retrieve that output into a new frame and write it to the
+// output pin.
+//
+// EOF and format changes are handled by sending a final process call and
+// draining the remaining output; after draining, EOF is forwarded and the
+// state reset, while a format change deletes the state so that it gets
+// recreated for the new format.
+static void process(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+
+    if (!mp_pin_in_needs_data(f->ppins[1]))
+        return;
+
+    while (!p->rubber || !p->pending || rubberband_available(p->rubber) <= 0) {
+        const float *dummy[MP_NUM_CHANNELS] = {0};
+        const float **in_data = dummy;
+        size_t in_samples = 0;
+
+        bool eof = false;
+        // Fetch a new frame once the pending one is fully consumed.
+        if (!p->pending || !mp_aframe_get_size(p->pending)) {
+            struct mp_frame frame = mp_pin_out_read(p->in_pin);
+            if (frame.type == MP_FRAME_AUDIO) {
+                TA_FREEP(&p->pending);
+                p->pending = frame.data;
+            } else if (frame.type == MP_FRAME_EOF) {
+                eof = true;
+            } else if (frame.type) {
+                MP_ERR(f, "unexpected frame type\n");
+                // We own the frame we just read; release it before failing.
+                mp_frame_unref(&frame);
+                goto error;
+            } else {
+                return; // no new data yet
+            }
+        }
+        assert(p->pending || eof);
+
+        if (!p->rubber) {
+            // EOF before any audio arrived: nothing to drain.
+            if (!p->pending) {
+                mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
+                return;
+            }
+            if (!init_rubberband(f))
+                goto error;
+        }
+
+        bool format_change =
+            p->pending && !mp_aframe_config_equals(p->pending, p->cur_format);
+
+        if (p->pending && !format_change) {
+            size_t needs = rubberband_get_samples_required(p->rubber);
+            uint8_t **planes = mp_aframe_get_data_ro(p->pending);
+            int num_planes = mp_aframe_get_planes(p->pending);
+            for (int n = 0; n < num_planes; n++)
+                in_data[n] = (void *)planes[n];
+            in_samples = MPMIN(mp_aframe_get_size(p->pending), needs);
+        }
+
+        // On EOF or format change no samples are passed (in_samples == 0),
+        // only the final flag, and only once.
+        bool final = format_change || eof;
+        if (!p->sent_final)
+            rubberband_process(p->rubber, in_data, in_samples, final);
+        p->sent_final |= final;
+
+        p->rubber_delay += in_samples;
+
+        if (p->pending && !format_change)
+            mp_aframe_skip_samples(p->pending, in_samples);
+
+        if (rubberband_available(p->rubber) > 0) {
+            if (eof)
+                mp_pin_out_repeat_eof(p->in_pin); // drain more next time
+        } else {
+            if (eof) {
+                mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
+                rubberband_reset(p->rubber);
+                p->rubber_delay = 0;
+                TA_FREEP(&p->pending);
+                p->sent_final = false;
+                return;
+            } else if (format_change) {
+                // go on with proper reinit on the next iteration
+                rubberband_delete(p->rubber);
+                p->sent_final = false;
+                p->rubber = NULL;
+            }
+        }
+    }
+
+    assert(p->pending);
+
+    int out_samples = rubberband_available(p->rubber);
+    if (out_samples > 0) {
+        struct mp_aframe *out = mp_aframe_new_ref(p->cur_format);
+        if (mp_aframe_pool_allocate(p->out_pool, out, out_samples) < 0) {
+            talloc_free(out);
+            goto error;
+        }
+
+        mp_aframe_copy_attributes(out, p->pending);
+
+        float *out_data[MP_NUM_CHANNELS] = {0};
+        uint8_t **planes = mp_aframe_get_data_rw(out);
+        assert(planes);
+        int num_planes = mp_aframe_get_planes(out);
+        for (int n = 0; n < num_planes; n++)
+            out_data[n] = (void *)planes[n];
+
+        out_samples = rubberband_retrieve(p->rubber, out_data, out_samples);
+
+        if (!out_samples) {
+            mp_filter_internal_mark_progress(f); // unexpected, just try again
+            talloc_free(out);
+            return;
+        }
+
+        mp_aframe_set_size(out, out_samples);
+
+        // Output samples correspond to out_samples * speed input samples.
+        p->rubber_delay -= out_samples * p->speed;
+
+        double pts = mp_aframe_get_pts(p->pending);
+        if (pts != MP_NOPTS_VALUE) {
+            // Note: rubberband_get_latency() does not do what you'd expect.
+            double delay = p->rubber_delay / mp_aframe_get_effective_rate(out);
+            mp_aframe_set_pts(out, pts - delay);
+        }
+
+        mp_aframe_mul_speed(out, p->speed);
+
+        mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out));
+    }
+
+    return;
+error:
+    mp_filter_internal_mark_failed(f);
+}
+
+// Command callback. Handles MP_FILTER_COMMAND_SET_SPEED and the text commands
+// "set-pitch" (absolute pitch scale) and "multiply-pitch" (factor applied to
+// the current pitch scale). Returns true if the command was accepted.
+static bool command(struct mp_filter *f, struct mp_filter_command *cmd)
+{
+    struct priv *p = f->priv;
+
+    if (cmd->type == MP_FILTER_COMMAND_SET_SPEED) {
+        update_speed(p, cmd->speed);
+        return true;
+    }
+    if (cmd->type != MP_FILTER_COMMAND_TEXT)
+        return false;
+
+    bool is_set = strcmp(cmd->cmd, "set-pitch") == 0;
+    bool is_mul = !is_set && strcmp(cmd->cmd, "multiply-pitch") == 0;
+    if (!is_set && !is_mul)
+        return false;
+
+    // Both commands take a single number; trailing garbage is rejected.
+    char *rest = NULL;
+    double value = strtod(cmd->arg, &rest);
+    if (*rest)
+        return false;
+
+    if (is_mul) {
+        if (value <= 0)
+            return false;
+        value = p->pitch * value;
+    }
+
+    // update_pitch() performs the range check on the resulting pitch.
+    return update_pitch(p, value);
+}
+
+// Reset callback: drop the pending input frame, clear the bookkeeping, and
+// reset librubberband if it has been initialized.
+static void reset(struct mp_filter *f)
+{
+    struct priv *state = f->priv;
+
+    TA_FREEP(&state->pending);
+    state->sent_final = false;
+    state->rubber_delay = 0;
+
+    if (state->rubber)
+        rubberband_reset(state->rubber);
+}
+
+// Destroy callback: free the pending input frame and the librubberband state
+// (if one was ever created).
+static void destroy(struct mp_filter *f)
+{
+    struct priv *state = f->priv;
+
+    talloc_free(state->pending);
+    if (state->rubber)
+        rubberband_delete(state->rubber);
+}
+
+// Filter descriptor handed to mp_filter_create() by af_rubberband_create().
+static const struct mp_filter_info af_rubberband_filter = {
+    .name = "rubberband",
+    .priv_size = sizeof(struct priv),
+    .process = process,
+    .command = command,
+    .reset = reset,
+    .destroy = destroy,
+};
+
+// Create a "rubberband" filter instance. The options struct is freed on
+// failure and otherwise reparented to the filter's private data. Input is
+// routed through an autoconvert sub-filter restricted to AF_FORMAT_FLOATP.
+// The RubberBandState itself is created lazily by process().
+static struct mp_filter *af_rubberband_create(struct mp_filter *parent,
+                                              void *options)
+{
+    struct mp_filter *filter = mp_filter_create(parent, &af_rubberband_filter);
+    if (!filter) {
+        talloc_free(options);
+        return NULL;
+    }
+
+    mp_filter_add_pin(filter, MP_PIN_IN, "in");
+    mp_filter_add_pin(filter, MP_PIN_OUT, "out");
+
+    struct priv *p = filter->priv;
+    struct f_opts *opts = talloc_steal(p, options);
+    p->opts = opts;
+    p->speed = 1.0;
+    p->pitch = opts->scale;
+    p->cur_format = talloc_steal(p, mp_aframe_create());
+    p->out_pool = mp_aframe_pool_create(p);
+
+    struct mp_autoconvert *conv = mp_autoconvert_create(filter);
+    if (!conv)
+        abort();
+
+    mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOATP);
+
+    struct mp_filter *conv_filter = conv->f;
+    mp_pin_connect(conv_filter->pins[0], filter->ppins[0]);
+    p->in_pin = conv_filter->pins[1];
+
+    return filter;
+}
+
+#define OPT_BASE_STRUCT struct f_opts
+
+// User-filter registration entry for the "rubberband" filter: option defaults
+// plus the table mapping each choice option onto the corresponding
+// RubberBandOption* constant stored in struct f_opts.
+const struct mp_user_filter_entry af_rubberband = {
+    .desc = {
+        .description = "Pitch conversion with librubberband",
+        .name = "rubberband",
+        .priv_size = sizeof(OPT_BASE_STRUCT),
+        .priv_defaults = &(const OPT_BASE_STRUCT) {
+            .scale = 1.0,
+            .pitch = RubberBandOptionPitchHighConsistency,
+            .transients = RubberBandOptionTransientsMixed,
+            .formant = RubberBandOptionFormantPreserved,
+            .channels = RubberBandOptionChannelsTogether,
+#if HAVE_RUBBERBAND_3
+            .engine = RubberBandOptionEngineFiner,
+#endif
+        },
+        .options = (const struct m_option[]) {
+            {"transients", OPT_CHOICE(transients,
+                {"crisp", RubberBandOptionTransientsCrisp},
+                {"mixed", RubberBandOptionTransientsMixed},
+                {"smooth", RubberBandOptionTransientsSmooth})},
+            {"detector", OPT_CHOICE(detector,
+                {"compound", RubberBandOptionDetectorCompound},
+                {"percussive", RubberBandOptionDetectorPercussive},
+                {"soft", RubberBandOptionDetectorSoft})},
+            {"phase", OPT_CHOICE(phase,
+                {"laminar", RubberBandOptionPhaseLaminar},
+                {"independent", RubberBandOptionPhaseIndependent})},
+            {"window", OPT_CHOICE(window,
+                {"standard", RubberBandOptionWindowStandard},
+                {"short", RubberBandOptionWindowShort},
+                {"long", RubberBandOptionWindowLong})},
+            {"smoothing", OPT_CHOICE(smoothing,
+                {"off", RubberBandOptionSmoothingOff},
+                {"on", RubberBandOptionSmoothingOn})},
+            {"formant", OPT_CHOICE(formant,
+                {"shifted", RubberBandOptionFormantShifted},
+                {"preserved", RubberBandOptionFormantPreserved})},
+            {"pitch", OPT_CHOICE(pitch,
+                {"quality", RubberBandOptionPitchHighQuality},
+                {"speed", RubberBandOptionPitchHighSpeed},
+                {"consistency", RubberBandOptionPitchHighConsistency})},
+            {"channels", OPT_CHOICE(channels,
+                {"apart", RubberBandOptionChannelsApart},
+                {"together", RubberBandOptionChannelsTogether})},
+#if HAVE_RUBBERBAND_3
+            {"engine", OPT_CHOICE(engine,
+                {"finer", RubberBandOptionEngineFiner},
+                {"faster", RubberBandOptionEngineFaster})},
+#endif
+            // Same range as enforced by update_pitch() for runtime changes.
+            {"pitch-scale", OPT_DOUBLE(scale), M_RANGE(0.01, 100)},
+            {0}
+        },
+    },
+    .create = af_rubberband_create,
+};
diff --git a/audio/filter/af_scaletempo.c b/audio/filter/af_scaletempo.c
new file mode 100644
index 0000000..f06478f
--- /dev/null
+++ b/audio/filter/af_scaletempo.c
@@ -0,0 +1,626 @@
+/*
+ * scaletempo audio filter
+ *
+ * scale tempo while maintaining pitch
+ * (WSOLA technique with cross correlation)
+ * inspired by SoundTouch library by Olli Parviainen
+ *
+ * basic algorithm
+ *   - produce 'stride' output samples per loop
+ *   - consume stride*scale input samples per loop
+ *
+ * to produce smoother transitions between strides, blend next overlap
+ * samples from last stride with correlated samples of current input
+ *
+ * Copyright (c) 2007 Robert Juliano
+ *
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <float.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "audio/aframe.h"
+#include "audio/format.h"
+#include "common/common.h"
+#include "filters/f_autoconvert.h"
+#include "filters/filter_internal.h"
+#include "filters/user_filters.h"
+#include "options/m_option.h"
+
+// User-settable options of the scaletempo filter (see the option table in
+// af_scaletempo for the option names and ranges).
+struct f_opts {
+    // Base scale; update_speed() multiplies it with the speed-derived factor.
+    float scale_nominal;
+    // Length of one output stride in milliseconds (converted to frames in
+    // reinit()).
+    float ms_stride;
+    // Length of the best-overlap search range in milliseconds.
+    float ms_search;
+    // Overlap length as a fraction of the stride length.
+    float factor_overlap;
+    // Bit flags stored in speed_opt.
+#define SCALE_TEMPO 1
+#define SCALE_PITCH 2
+    // Combination of SCALE_TEMPO / SCALE_PITCH (0 == "none").
+    int speed_opt;
+};
+
+// Per-instance state of the scaletempo filter.
+struct priv {
+    struct f_opts *opts;
+
+    // Pin the filter reads input from (the autoconvert output, see
+    // af_scaletempo_create()).
+    struct mp_pin *in_pin;
+    // Format the buffers below were set up for by reinit().
+    struct mp_aframe *cur_format;
+    struct mp_aframe_pool *out_pool;
+    // End PTS of the most recently read input frame.
+    double current_pts;
+    // Input frame that is not fully consumed yet (NULL if none).
+    struct mp_aframe *in;
+
+    // stride
+    float scale;                // effective scale (see update_speed())
+    float speed;                // last speed set via update_speed()
+    int frames_stride;          // output frames per stride
+    float frames_stride_scaled; // scale * frames_stride
+    float frames_stride_error;  // fractional remainder carried between strides
+    int bytes_per_frame;        // bytes per sample * channels
+    int bytes_stride;           // frames_stride in bytes
+    int bytes_queue;            // fill level of buf_queue required for a stride
+    int bytes_queued;           // bytes currently held in buf_queue
+    int bytes_to_slide;         // bytes still to be dropped from queue/input
+    int8_t *buf_queue;
+    // overlap
+    int samples_overlap;        // overlap length in samples (frames * channels)
+    int samples_standing;       // bytes_standing expressed in samples
+    int bytes_overlap;          // overlap length in bytes
+    int bytes_standing;         // bytes_stride - bytes_overlap
+    void *buf_overlap;          // data to be blended into the next stride
+    void *table_blend;          // per-sample blend weights
+    // Writes the blended overlap region; NULL if there is no overlap.
+    void (*output_overlap)(struct priv *s, void *out_buf,
+                           int bytes_off);
+    // best overlap
+    int frames_search;          // number of candidate offsets to test
+    int num_channels;
+    void *buf_pre_corr;         // window * overlap, precomputed per search
+    void *table_window;         // correlation window
+    // Returns the byte offset of the best match; NULL if search is disabled.
+    int (*best_overlap_offset)(struct priv *s);
+};
+
+// Forward declaration: process() calls reinit(), which is defined further down.
+static bool reinit(struct mp_filter *f);
+
+// Move data from the pending input frame (s->in, may be NULL) into buf_queue.
+// First honors any outstanding slide request by discarding bytes from the
+// front of the queue and, if that is not enough, from the input frame.
+// Returns true once the queue holds exactly s->bytes_queue bytes.
+static bool fill_queue(struct priv *s)
+{
+    int avail = 0;    // unread bytes left in the input frame
+    int consumed = 0; // bytes of the input frame used up by this call
+    if (s->in)
+        avail = mp_aframe_get_size(s->in) * s->bytes_per_frame;
+
+    int slide = s->bytes_to_slide;
+    if (slide > 0 && slide < s->bytes_queued) {
+        // The whole slide fits inside the queue: shift the remainder down.
+        int keep = s->bytes_queued - slide;
+        memmove(s->buf_queue, s->buf_queue + slide, keep);
+        s->bytes_queued = keep;
+        s->bytes_to_slide = 0;
+    } else if (slide > 0) {
+        // The queue is dropped entirely; skip the rest in the input frame.
+        int remaining = slide - s->bytes_queued;
+        int skip = MPMIN(remaining, avail);
+        s->bytes_queued = 0;
+        s->bytes_to_slide = remaining - skip;
+        consumed = skip;
+        avail -= skip;
+    }
+
+    int missing = s->bytes_queue - s->bytes_queued;
+    assert(missing >= 0);
+
+    int ncopy = MPMIN(missing, avail);
+    if (ncopy > 0) {
+        uint8_t **planes = mp_aframe_get_data_ro(s->in);
+        memcpy(s->buf_queue + s->bytes_queued, planes[0] + consumed, ncopy);
+        s->bytes_queued += ncopy;
+        consumed += ncopy;
+        missing -= ncopy;
+    }
+
+    if (s->in)
+        mp_aframe_skip_samples(s->in, consumed / s->bytes_per_frame);
+
+    return missing == 0;
+}
+
+// Extra bytes appended to buf_queue and (s16 mode) buf_pre_corr in reinit().
+#define UNROLL_PADDING (4 * 4)
+
+// Float variant of the best-overlap search: correlates the windowed previous
+// overlap against every candidate position in buf_queue and returns the byte
+// offset of the candidate with the highest correlation.
+static int best_overlap_offset_float(struct priv *s)
+{
+    const int nch = s->num_channels;
+    // The first frame is skipped on both sides of the correlation.
+    const int count = s->samples_overlap - nch;
+
+    float *window = s->table_window;
+    float *prev = (float *)s->buf_overlap + nch;
+    float *pre_corr = s->buf_pre_corr;
+    for (int k = 0; k < count; k++)
+        pre_corr[k] = window[k] * prev[k];
+
+    float best_corr = INT_MIN;
+    int best_off = 0;
+    float *cand = (float *)s->buf_queue + nch;
+    for (int off = 0; off < s->frames_search; off++, cand += nch) {
+        float corr = 0;
+        for (int k = 0; k < count; k++)
+            corr += pre_corr[k] * cand[k];
+        if (corr > best_corr) {
+            best_off = off;
+            best_corr = corr;
+        }
+    }
+
+    // Convert the frame offset to bytes (4 bytes per float sample).
+    return best_off * 4 * nch;
+}
+
+// s16 variant of the best-overlap search: correlates the windowed previous
+// overlap against every candidate position in buf_queue and returns the byte
+// offset of the candidate with the highest correlation.
+//
+// The correlation runs over exactly (samples_overlap - num_channels) entries,
+// i.e. the entries of buf_pre_corr that are written below. A 4x unrolled loop
+// would round that count up and could read buf_pre_corr entries that are
+// neither written here nor zeroed by reinit() (only the padding behind
+// samples_overlap is zeroed), letting uninitialized values leak into the
+// correlation sum.
+static int best_overlap_offset_s16(struct priv *s)
+{
+    int64_t best_corr = INT64_MIN;
+    int best_off = 0;
+    const int nch = s->num_channels;
+    const long count = s->samples_overlap - nch;
+
+    // Pre-multiply the previous overlap with the window once per search.
+    int32_t *pw  = s->table_window;
+    int16_t *po  = s->buf_overlap;
+    po += nch;
+    int32_t *ppc = s->buf_pre_corr;
+    for (long i = 0; i < count; i++)
+        ppc[i] = (pw[i] * po[i]) >> 15;
+
+    int16_t *search_start = (int16_t *)s->buf_queue + nch;
+    for (int off = 0; off < s->frames_search; off++) {
+        int64_t corr = 0;
+        for (long i = 0; i < count; i++)
+            corr += ppc[i] * (int64_t)search_start[i];
+        if (corr > best_corr) {
+            best_corr = corr;
+            best_off  = off;
+        }
+        search_start += nch;
+    }
+
+    // Convert the frame offset to bytes (2 bytes per s16 sample).
+    return best_off * 2 * nch;
+}
+
+// Write the blended overlap region (float samples) to buf_out: each output
+// sample moves from the previous overlap towards the queue data found at
+// bytes_off, weighted by table_blend.
+static void output_overlap_float(struct priv *s, void *buf_out,
+                                 int bytes_off)
+{
+    float *dst   = buf_out;
+    float *blend = s->table_blend;
+    float *prev  = s->buf_overlap;
+    float *next  = (float *)(s->buf_queue + bytes_off);
+    const int count = s->samples_overlap;
+
+    for (int k = 0; k < count; k++)
+        dst[k] = prev[k] - blend[k] * (prev[k] - next[k]);
+}
+
+// Write the blended overlap region (s16 samples) to buf_out. table_blend
+// holds 16.16 fixed-point weights, hence the shift by 16.
+static void output_overlap_s16(struct priv *s, void *buf_out,
+                               int bytes_off)
+{
+    int16_t *dst   = buf_out;
+    int32_t *blend = s->table_blend;
+    int16_t *prev  = s->buf_overlap;
+    int16_t *next  = (int16_t *)(s->buf_queue + bytes_off);
+    const int count = s->samples_overlap;
+
+    for (int k = 0; k < count; k++)
+        dst[k] = prev[k] - ((blend[k] * (prev[k] - next[k])) >> 16);
+}
+
+// Filter callback. Reads input when none is pending, fills the queue, and
+// writes at most one output frame (one stride, plus the remaining queue
+// contents when draining on EOF or before a format change).
+// On any error the filter is marked as failed.
+static void process(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    if (!mp_pin_in_needs_data(f->ppins[1]))
+        return;
+
+    struct mp_aframe *out = NULL;
+
+    bool drain = false;
+    bool is_eof = false;
+    if (!s->in) {
+        struct mp_frame frame = mp_pin_out_read(s->in_pin);
+        if (!frame.type)
+            return; // no input yet
+        if (frame.type != MP_FRAME_AUDIO && frame.type != MP_FRAME_EOF) {
+            MP_ERR(f, "unexpected frame type\n");
+            // The frame was handed to us by the read; release it rather than
+            // leaking it on the error path.
+            mp_frame_unref(&frame);
+            goto error;
+        }
+
+        s->in = frame.type == MP_FRAME_AUDIO ? frame.data : NULL;
+        is_eof = drain = !s->in;
+
+        // EOF before it was even initialized once.
+        if (is_eof && !mp_aframe_config_is_valid(s->cur_format)) {
+            mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
+            return;
+        }
+
+        if (s->in && !mp_aframe_config_equals(s->in, s->cur_format)) {
+            if (s->bytes_queued) {
+                // Drain remaining data before executing the format change.
+                MP_VERBOSE(f, "draining\n");
+                mp_pin_out_unread(s->in_pin, frame);
+                s->in = NULL;
+                drain = true;
+            } else {
+                if (!reinit(f)) {
+                    MP_ERR(f, "initialization failed\n");
+                    goto error;
+                }
+            }
+        }
+
+        if (s->in)
+            s->current_pts = mp_aframe_end_pts(s->in);
+    }
+
+    if (!fill_queue(s) && !drain) {
+        TA_FREEP(&s->in);
+        mp_pin_out_request_data_next(s->in_pin);
+        return;
+    }
+
+    // One stride, plus (when draining) everything still queued. Both terms
+    // are in frames; bytes_queued only ever grows by whole frames.
+    int max_out_samples = s->bytes_stride / s->bytes_per_frame;
+    if (drain)
+        max_out_samples += s->bytes_queued / s->bytes_per_frame;
+
+    out = mp_aframe_new_ref(s->cur_format);
+    if (mp_aframe_pool_allocate(s->out_pool, out, max_out_samples) < 0)
+        goto error;
+
+    if (s->in)
+        mp_aframe_copy_attributes(out, s->in);
+
+    uint8_t **out_planes = mp_aframe_get_data_rw(out);
+    if (!out_planes)
+        goto error;
+    uint8_t *pout = out_planes[0];
+    int out_offset = 0;
+    if (s->bytes_queued >= s->bytes_queue) {
+        int ti;
+        float tf;
+        int bytes_off = 0;
+
+        // output stride
+        if (s->output_overlap) {
+            if (s->best_overlap_offset)
+                bytes_off = s->best_overlap_offset(s);
+            s->output_overlap(s, pout + out_offset, bytes_off);
+        }
+        memcpy(pout + out_offset + s->bytes_overlap,
+               s->buf_queue + bytes_off + s->bytes_overlap,
+               s->bytes_standing);
+        out_offset += s->bytes_stride;
+
+        // input stride
+        memcpy(s->buf_overlap,
+               s->buf_queue + bytes_off + s->bytes_stride,
+               s->bytes_overlap);
+        // Carry the fractional part of the scaled stride over to the next
+        // iteration so the average input consumption stays exact.
+        tf = s->frames_stride_scaled + s->frames_stride_error;
+        ti = (int)tf;
+        s->frames_stride_error = tf - ti;
+        s->bytes_to_slide = ti * s->bytes_per_frame;
+    }
+    // Drain remaining buffered data.
+    if (drain && s->bytes_queued) {
+        memcpy(pout + out_offset, s->buf_queue, s->bytes_queued);
+        out_offset += s->bytes_queued;
+        s->bytes_queued = 0;
+    }
+    mp_aframe_set_size(out, out_offset / s->bytes_per_frame);
+
+    // This filter can have a negative delay when scale > 1:
+    // output corresponding to some length of input can be decided and written
+    // after receiving only a part of that input.
+    float delay = (out_offset * s->speed + s->bytes_queued - s->bytes_to_slide) /
+                    s->bytes_per_frame / mp_aframe_get_effective_rate(out)
+                  + (s->in ? mp_aframe_duration(s->in) : 0);
+
+    if (s->current_pts != MP_NOPTS_VALUE)
+        mp_aframe_set_pts(out, s->current_pts - delay);
+
+    mp_aframe_mul_speed(out, s->speed);
+
+    if (!mp_aframe_get_size(out))
+        TA_FREEP(&out);
+
+    if (is_eof && out) {
+        mp_pin_out_repeat_eof(s->in_pin);
+    } else if (is_eof && !out) {
+        mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
+    } else if (!is_eof && !out) {
+        mp_pin_out_request_data_next(s->in_pin);
+    }
+
+    if (out)
+        mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out));
+
+    return;
+
+error:
+    TA_FREEP(&s->in);
+    talloc_free(out);
+    mp_filter_internal_mark_failed(f);
+}
+
+// Store the new playback speed and recompute the values derived from it.
+// With SCALE_PITCH set in speed_opt the factor is 1/speed, otherwise speed.
+static void update_speed(struct priv *s, float speed)
+{
+    s->speed = speed;
+
+    double factor;
+    if (s->opts->speed_opt & SCALE_PITCH) {
+        factor = 1.0 / s->speed;
+    } else {
+        factor = s->speed;
+    }
+    s->scale = factor * s->opts->scale_nominal;
+
+    // Input frames consumed per output stride; the carried error may not
+    // exceed it.
+    float scaled = s->scale * s->frames_stride;
+    s->frames_stride_scaled = scaled;
+    s->frames_stride_error = MPMIN(s->frames_stride_error, scaled);
+}
+
+// (Re)initialize all buffers and tables for the format of s->in.
+// Only AF_FORMAT_S16 and AF_FORMAT_FLOAT are supported. Returns false on an
+// unsupported format or allocation failure; s->cur_format is only set to the
+// new format on success (it is reset at the start).
+static bool reinit(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    mp_aframe_reset(s->cur_format);
+
+    // Sample rate in frames per millisecond, so ms_* options convert to
+    // frame counts with a plain multiplication.
+    float srate  = mp_aframe_get_rate(s->in) / 1000.0;
+    int nch = mp_aframe_get_channels(s->in);
+    int format = mp_aframe_get_format(s->in);
+
+    int use_int = 0;
+    if (format == AF_FORMAT_S16) {
+        use_int = 1;
+    } else if (format != AF_FORMAT_FLOAT) {
+        return false;
+    }
+    // Bytes per sample.
+    int bps = use_int ? 2 : 4;
+
+    s->frames_stride        = srate * s->opts->ms_stride;
+    s->bytes_stride         = s->frames_stride * bps * nch;
+
+    // Recompute the speed-dependent values for the new stride length.
+    update_speed(s, s->speed);
+
+    int frames_overlap = s->frames_stride * s->opts->factor_overlap;
+    if (frames_overlap <= 0) {
+        // No overlap: the whole stride is copied verbatim.
+        s->bytes_standing   = s->bytes_stride;
+        s->samples_standing = s->bytes_standing / bps;
+        s->output_overlap   = NULL;
+        s->bytes_overlap    = 0;
+    } else {
+        s->samples_overlap  = frames_overlap * nch;
+        s->bytes_overlap    = frames_overlap * nch * bps;
+        s->bytes_standing   = s->bytes_stride - s->bytes_overlap;
+        s->samples_standing = s->bytes_standing / bps;
+        // NOTE(review): the realloc() results overwrite the old pointers, so
+        // on failure the previous buffer is no longer reachable.
+        s->buf_overlap      = realloc(s->buf_overlap, s->bytes_overlap);
+        s->table_blend      = realloc(s->table_blend, s->bytes_overlap * 4);
+        if (!s->buf_overlap || !s->table_blend) {
+            MP_FATAL(f, "Out of memory\n");
+            return false;
+        }
+        memset(s->buf_overlap, 0, s->bytes_overlap);
+        // Linear blend ramp, one weight per frame replicated for each
+        // channel: i / frames_overlap (16.16 fixed point in s16 mode).
+        if (use_int) {
+            int32_t *pb = s->table_blend;
+            int64_t blend = 0;
+            for (int i = 0; i < frames_overlap; i++) {
+                int32_t v = blend / frames_overlap;
+                for (int j = 0; j < nch; j++)
+                    *pb++ = v;
+                blend += 65536; // 2^16
+            }
+            s->output_overlap = output_overlap_s16;
+        } else {
+            float *pb = s->table_blend;
+            for (int i = 0; i < frames_overlap; i++) {
+                float v = i / (float)frames_overlap;
+                for (int j = 0; j < nch; j++)
+                    *pb++ = v;
+            }
+            s->output_overlap = output_overlap_float;
+        }
+    }
+
+    // Searching only makes sense with more than one frame of overlap.
+    s->frames_search = (frames_overlap > 1) ? srate * s->opts->ms_search : 0;
+    if (s->frames_search <= 0)
+        s->best_overlap_offset = NULL;
+    else {
+        if (use_int) {
+            int64_t t = frames_overlap;
+            int32_t n = 8589934588LL / (t * t); // 4 * (2^31 - 1) / t^2
+            s->buf_pre_corr = realloc(s->buf_pre_corr,
+                                        s->bytes_overlap * 2 + UNROLL_PADDING);
+            s->table_window = realloc(s->table_window,
+                                        s->bytes_overlap * 2 - nch * bps * 2);
+            if (!s->buf_pre_corr || !s->table_window) {
+                MP_FATAL(f, "Out of memory\n");
+                return false;
+            }
+            // NOTE(review): only the UNROLL_PADDING tail is zeroed here; the
+            // nch entries directly before it are not initialized by this
+            // function.
+            memset((char *)s->buf_pre_corr + s->bytes_overlap * 2, 0,
+                    UNROLL_PADDING);
+            // Window i * (t - i), scaled by n, for frames 1..t-1.
+            int32_t *pw = s->table_window;
+            for (int i = 1; i < frames_overlap; i++) {
+                int32_t v = (i * (t - i) * n) >> 15;
+                for (int j = 0; j < nch; j++)
+                    *pw++ = v;
+            }
+            s->best_overlap_offset = best_overlap_offset_s16;
+        } else {
+            s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap);
+            s->table_window = realloc(s->table_window,
+                                        s->bytes_overlap - nch * bps);
+            if (!s->buf_pre_corr || !s->table_window) {
+                MP_FATAL(f, "Out of memory\n");
+                return false;
+            }
+            // Same window shape as the s16 path, unscaled.
+            float *pw = s->table_window;
+            for (int i = 1; i < frames_overlap; i++) {
+                float v = i * (frames_overlap - i);
+                for (int j = 0; j < nch; j++)
+                    *pw++ = v;
+            }
+            s->best_overlap_offset = best_overlap_offset_float;
+        }
+    }
+
+    s->bytes_per_frame = bps * nch;
+    s->num_channels    = nch;
+
+    // The queue must hold the search range, one stride and the overlap that
+    // follows it.
+    s->bytes_queue = (s->frames_search + s->frames_stride + frames_overlap)
+                        * bps * nch;
+    s->buf_queue = realloc(s->buf_queue, s->bytes_queue + UNROLL_PADDING);
+    if (!s->buf_queue) {
+        MP_FATAL(f, "Out of memory\n");
+        return false;
+    }
+
+    s->bytes_queued = 0;
+    s->bytes_to_slide = 0;
+
+    MP_DBG(f, ""
+           "%.2f stride_in, %i stride_out, %i standing, "
+           "%i overlap, %i search, %i queue, %s mode\n",
+           s->frames_stride_scaled,
+           (int)(s->bytes_stride / nch / bps),
+           (int)(s->bytes_standing / nch / bps),
+           (int)(s->bytes_overlap / nch / bps),
+           s->frames_search,
+           (int)(s->bytes_queue / nch / bps),
+           (use_int ? "s16" : "float"));
+
+    // Remember the format the buffers were set up for.
+    mp_aframe_config_copy(s->cur_format, s->in);
+
+    return true;
+}
+
+// Filter command handler. Only MP_FILTER_COMMAND_SET_SPEED is handled:
+//  - speed=tempo: apply the speed and report it as handled
+//  - speed=pitch: apply the speed but do not signal OK
+//  - speed=both / speed=none: ignore the command
+static bool command(struct mp_filter *f, struct mp_filter_command *cmd)
+{
+    struct priv *s = f->priv;
+
+    if (cmd->type != MP_FILTER_COMMAND_SET_SPEED)
+        return false;
+
+    bool tempo = s->opts->speed_opt & SCALE_TEMPO;
+    bool pitch = s->opts->speed_opt & SCALE_PITCH;
+
+    // Both flags or neither: the speed is left untouched.
+    if (tempo == pitch)
+        return false;
+
+    update_speed(s, cmd->speed);
+    return tempo; // pitch-only mode does not signal OK
+}
+
+// Filter reset callback: drop the pending input frame and all queued data,
+// and clear the overlap buffer and timing state.
+static void reset(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    TA_FREEP(&s->in);
+
+    if (s->buf_overlap && s->bytes_overlap)
+        memset(s->buf_overlap, 0, s->bytes_overlap);
+
+    s->frames_stride_error = 0;
+    s->bytes_to_slide = 0;
+    s->bytes_queued = 0;
+    s->current_pts = MP_NOPTS_VALUE;
+}
+
+// Filter destroy callback: release the malloc'ed work buffers, the pending
+// input frame and all child filters.
+static void destroy(struct mp_filter *f)
+{
+    struct priv *s = f->priv;
+
+    void *buffers[] = {
+        s->buf_queue,
+        s->buf_overlap,
+        s->buf_pre_corr,
+        s->table_blend,
+        s->table_window,
+    };
+    for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); i++)
+        free(buffers[i]);
+
+    TA_FREEP(&s->in);
+    mp_filter_free_children(f);
+}
+
+// Filter description passed to mp_filter_create(): name, private state size
+// and the callbacks implemented above.
+static const struct mp_filter_info af_scaletempo_filter = {
+    .name = "scaletempo",
+    .priv_size = sizeof(struct priv),
+    .process = process,
+    .command = command,
+    .reset = reset,
+    .destroy = destroy,
+};
+
+// Create a scaletempo filter instance below parent. Takes ownership of
+// options (freed if filter creation fails). An autoconvert child filter
+// restricted to S16/FLOAT is placed in front of the filter, and input is
+// read from its output pin.
+static struct mp_filter *af_scaletempo_create(struct mp_filter *parent,
+                                              void *options)
+{
+    struct mp_filter *filt = mp_filter_create(parent, &af_scaletempo_filter);
+    if (filt == NULL) {
+        talloc_free(options);
+        return NULL;
+    }
+
+    mp_filter_add_pin(filt, MP_PIN_IN, "in");
+    mp_filter_add_pin(filt, MP_PIN_OUT, "out");
+
+    struct priv *st = filt->priv;
+    st->opts = talloc_steal(st, options);
+    st->speed = 1.0;
+    st->cur_format = talloc_steal(st, mp_aframe_create());
+    st->out_pool = mp_aframe_pool_create(st);
+
+    struct mp_autoconvert *conv = mp_autoconvert_create(filt);
+    if (conv == NULL)
+        abort();
+
+    // Formats process()/reinit() can handle.
+    mp_autoconvert_add_afmt(conv, AF_FORMAT_S16);
+    mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOAT);
+
+    // outer input -> autoconvert -> this filter
+    mp_pin_connect(conv->f->pins[0], filt->ppins[0]);
+    st->in_pin = conv->f->pins[1];
+
+    return filt;
+}
+
+// The OPT_* macros below refer to fields of this struct.
+#define OPT_BASE_STRUCT struct f_opts
+
+// User filter entry for "scaletempo": description, option defaults, option
+// table and constructor.
+const struct mp_user_filter_entry af_scaletempo = {
+    .desc = {
+        .description = "Scale audio tempo while maintaining pitch",
+        .name = "scaletempo",
+        .priv_size = sizeof(OPT_BASE_STRUCT),
+        .priv_defaults = &(const OPT_BASE_STRUCT) {
+            // stride and search are in milliseconds, overlap is a fraction
+            // of the stride (see reinit()).
+            .ms_stride = 60,
+            .factor_overlap = .20,
+            .ms_search = 14,
+            .speed_opt = SCALE_TEMPO,
+            .scale_nominal = 1.0,
+        },
+        .options = (const struct m_option[]) {
+            {"scale", OPT_FLOAT(scale_nominal), M_RANGE(0.01, DBL_MAX)},
+            {"stride", OPT_FLOAT(ms_stride), M_RANGE(0.01, DBL_MAX)},
+            {"overlap", OPT_FLOAT(factor_overlap), M_RANGE(0, 1)},
+            {"search", OPT_FLOAT(ms_search), M_RANGE(0, DBL_MAX)},
+            // Selects how speed changes are applied (see command()).
+            {"speed", OPT_CHOICE(speed_opt,
+                {"pitch", SCALE_PITCH},
+                {"tempo", SCALE_TEMPO},
+                {"none", 0},
+                {"both", SCALE_TEMPO | SCALE_PITCH})},
+            {0}
+        },
+    },
+    .create = af_scaletempo_create,
+};
diff --git a/audio/filter/af_scaletempo2.c b/audio/filter/af_scaletempo2.c
new file mode 100644
index 0000000..7ad8e35
--- /dev/null
+++ b/audio/filter/af_scaletempo2.c
@@ -0,0 +1,254 @@
+#include "audio/aframe.h"
+#include "audio/filter/af_scaletempo2_internals.h"
+#include "audio/format.h"
+#include "common/common.h"
+#include "filters/f_autoconvert.h"
+#include "filters/filter_internal.h"
+#include "filters/user_filters.h"
+#include "options/m_option.h"
+
+// Per-instance state of the scaletempo2 filter.
+struct priv {
+    // State of the scaletempo2 engine; data.opts holds the user options.
+    struct mp_scaletempo2 data;
+    // Pin the filter reads input from (the autoconvert output, see
+    // af_scaletempo2_create()).
+    struct mp_pin *in_pin;
+    // Format the engine was initialized with (see init_scaletempo2()).
+    struct mp_aframe *cur_format;
+    struct mp_aframe_pool *out_pool;
+    // Set once mp_scaletempo2_set_final() was called for the current stream.
+    bool sent_final;
+    // Most recent input frame; samples are skipped as the engine consumes
+    // them.
+    struct mp_aframe *pending;
+    // Whether init_scaletempo2() has run for the current format.
+    bool initialized;
+    // Playback speed set via MP_FILTER_COMMAND_SET_SPEED.
+    float speed;
+};
+
+// Forward declarations of functions defined further down in this file.
+static bool init_scaletempo2(struct mp_filter *f);
+static void reset(struct mp_filter *f);
+
+// Filter callback. Feeds input into the scaletempo2 engine until it can
+// produce output (or the stream ends), then writes at most one output frame.
+// EOF and format changes are handled by marking the engine input as final,
+// draining it, and re-initializing for the new format.
+// On any error the filter is marked as failed.
+static void process(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+
+    if (!mp_pin_in_needs_data(f->ppins[1]))
+        return;
+
+    while (!p->initialized || !p->pending ||
+           !mp_scaletempo2_frames_available(&p->data, p->speed))
+    {
+        bool eof = false;
+        // Only read a new frame once the pending one is fully consumed.
+        if (!p->pending || !mp_aframe_get_size(p->pending)) {
+            struct mp_frame frame = mp_pin_out_read(p->in_pin);
+            if (frame.type == MP_FRAME_AUDIO) {
+                TA_FREEP(&p->pending);
+                p->pending = frame.data;
+            } else if (frame.type == MP_FRAME_EOF) {
+                eof = true;
+            } else if (frame.type) {
+                MP_ERR(f, "unexpected frame type\n");
+                // The frame was handed to us by the read; release it rather
+                // than leaking it on the error path.
+                mp_frame_unref(&frame);
+                goto error;
+            } else {
+                return; // no new data yet
+            }
+        }
+        assert(p->pending || eof);
+
+        if (!p->initialized) {
+            // EOF before any audio was seen: nothing to drain.
+            if (!p->pending) {
+                mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
+                return;
+            }
+            if (!init_scaletempo2(f))
+                goto error;
+        }
+
+        bool format_change =
+            p->pending && !mp_aframe_config_equals(p->pending, p->cur_format);
+
+        bool final = format_change || eof;
+        if (p->pending && !format_change && !p->sent_final) {
+            int frame_size = mp_aframe_get_size(p->pending);
+            uint8_t **planes = mp_aframe_get_data_ro(p->pending);
+            int read = mp_scaletempo2_fill_input_buffer(&p->data,
+                planes, frame_size, p->speed);
+            mp_aframe_skip_samples(p->pending, read);
+        }
+        if (final && p->pending && !p->sent_final) {
+            mp_scaletempo2_set_final(&p->data);
+            p->sent_final = true;
+        }
+
+        if (mp_scaletempo2_frames_available(&p->data, p->speed)) {
+            if (eof) {
+                mp_pin_out_repeat_eof(p->in_pin); // drain more next time
+            }
+        } else if (final) {
+            // Fully drained: force a re-init before any further input.
+            p->initialized = false;
+            p->sent_final = false;
+            if (eof) {
+                mp_pin_in_write(f->ppins[1], MP_EOF_FRAME);
+                return;
+            }
+            // for format change go on with proper reinit on the next iteration
+        }
+    }
+
+    assert(p->pending);
+    if (mp_scaletempo2_frames_available(&p->data, p->speed)) {
+        struct mp_aframe *out = mp_aframe_new_ref(p->cur_format);
+        int out_samples = p->data.ola_hop_size;
+        if (mp_aframe_pool_allocate(p->out_pool, out, out_samples) < 0) {
+            talloc_free(out);
+            goto error;
+        }
+
+        mp_aframe_copy_attributes(out, p->pending);
+
+        uint8_t **planes = mp_aframe_get_data_rw(out);
+        assert(planes);
+        assert(mp_aframe_get_planes(out) == p->data.channels);
+
+        out_samples = mp_scaletempo2_fill_buffer(&p->data,
+            (float**)planes, out_samples, p->speed);
+
+        double pts = mp_aframe_get_pts(p->pending);
+        if (pts != MP_NOPTS_VALUE) {
+            // Output PTS = PTS of the remaining pending input minus the
+            // engine latency and the input time covered by this output.
+            double frame_delay = mp_scaletempo2_get_latency(&p->data, p->speed)
+                + out_samples * p->speed;
+            mp_aframe_set_pts(out, pts - frame_delay / mp_aframe_get_effective_rate(out));
+
+            if (p->sent_final) {
+                double remain_pts = pts - mp_aframe_get_pts(out);
+                double rate = mp_aframe_get_effective_rate(out) / p->speed;
+                int max_samples = MPMAX(0, (int) (remain_pts * rate));
+                // truncate final packet to expected length
+                if (out_samples >= max_samples) {
+                    out_samples = max_samples;
+
+                    // reset the filter to ensure it stops generating audio
+                    // and mp_scaletempo2_frames_available returns false
+                    mp_scaletempo2_reset(&p->data);
+                }
+            }
+        }
+
+        mp_aframe_set_size(out, out_samples);
+        mp_aframe_mul_speed(out, p->speed);
+        mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out));
+    }
+
+    return;
+error:
+    mp_filter_internal_mark_failed(f);
+}
+
+// Initialize the scaletempo2 engine for the format of the pending frame and
+// remember that format in cur_format. Fails (returns false, state untouched)
+// unless the frame is planar float.
+static bool init_scaletempo2(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+    struct mp_aframe *src = p->pending;
+    assert(src);
+
+    if (mp_aframe_get_format(src) != AF_FORMAT_FLOATP)
+        return false;
+
+    p->sent_final = false;
+    p->initialized = true;
+
+    mp_aframe_reset(p->cur_format);
+    mp_aframe_config_copy(p->cur_format, src);
+
+    int channels = mp_aframe_get_channels(src);
+    int rate = mp_aframe_get_rate(src);
+    mp_scaletempo2_init(&p->data, channels, rate);
+
+    return true;
+}
+
+// Filter command handler: stores the speed for MP_FILTER_COMMAND_SET_SPEED
+// and reports it as handled; every other command is rejected.
+static bool command(struct mp_filter *f, struct mp_filter_command *cmd)
+{
+    struct priv *p = f->priv;
+
+    if (cmd->type != MP_FILTER_COMMAND_SET_SPEED)
+        return false;
+
+    p->speed = cmd->speed;
+    return true;
+}
+
+// Filter reset callback: drop the pending frame, reset the engine and force
+// a re-initialization on the next input frame.
+static void reset(struct mp_filter *f)
+{
+    struct priv *p = f->priv;
+
+    TA_FREEP(&p->pending);
+    p->initialized = false;
+    mp_scaletempo2_reset(&p->data);
+}
+
+// Filter destroy callback: free the pending frame and the engine state.
+static void destroy(struct mp_filter *f)
+{
+    struct priv *state = f->priv;
+
+    talloc_free(state->pending);
+    mp_scaletempo2_destroy(&state->data);
+}
+
+// Filter description passed to mp_filter_create(): name, private state size
+// and the callbacks implemented above.
+static const struct mp_filter_info af_scaletempo2_filter = {
+    .name = "scaletempo2",
+    .priv_size = sizeof(struct priv),
+    .process = process,
+    .command = command,
+    .reset = reset,
+    .destroy = destroy,
+};
+
+// Create a scaletempo2 filter instance below parent. Takes ownership of
+// options (freed if filter creation fails). An autoconvert child filter
+// restricted to planar float is placed in front of the filter, and input is
+// read from its output pin.
+static struct mp_filter *af_scaletempo2_create(
+    struct mp_filter *parent, void *options)
+{
+    struct mp_filter *filt = mp_filter_create(parent, &af_scaletempo2_filter);
+    if (filt == NULL) {
+        talloc_free(options);
+        return NULL;
+    }
+
+    mp_filter_add_pin(filt, MP_PIN_IN, "in");
+    mp_filter_add_pin(filt, MP_PIN_OUT, "out");
+
+    struct priv *st = filt->priv;
+    st->data.opts = talloc_steal(st, options);
+    st->speed = 1.0;
+    st->cur_format = talloc_steal(st, mp_aframe_create());
+    st->out_pool = mp_aframe_pool_create(st);
+    st->pending = NULL;
+    st->initialized = false;
+
+    struct mp_autoconvert *conv = mp_autoconvert_create(filt);
+    if (conv == NULL)
+        abort();
+
+    // The only format init_scaletempo2() accepts.
+    mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOATP);
+
+    // outer input -> autoconvert -> this filter
+    mp_pin_connect(conv->f->pins[0], filt->ppins[0]);
+    st->in_pin = conv->f->pins[1];
+
+    return filt;
+}
+
+// The OPT_* macros below refer to fields of this struct.
+#define OPT_BASE_STRUCT struct mp_scaletempo2_opts
+// User filter entry for "scaletempo2": description, option defaults, option
+// table and constructor.
+const struct mp_user_filter_entry af_scaletempo2 = {
+    .desc = {
+        .description = "Scale audio tempo while maintaining pitch"
+            " (filter ported from chromium)",
+        .name = "scaletempo2",
+        .priv_size = sizeof(OPT_BASE_STRUCT),
+        .priv_defaults = &(const OPT_BASE_STRUCT) {
+            .min_playback_rate = 0.25,
+            .max_playback_rate = 8.0,
+            .ola_window_size_ms = 12,
+            .wsola_search_interval_ms = 40,
+        },
+        .options = (const struct m_option[]) {
+            // Both durations are given in milliseconds.
+            {"search-interval",
+                OPT_FLOAT(wsola_search_interval_ms), M_RANGE(1, 1000)},
+            {"window-size",
+                OPT_FLOAT(ola_window_size_ms), M_RANGE(1, 1000)},
+            // NOTE(review): FLT_MAX needs <float.h>, which this file does not
+            // include directly; presumably it comes in via another header.
+            {"min-speed",
+                OPT_FLOAT(min_playback_rate), M_RANGE(0, FLT_MAX)},
+            {"max-speed",
+                OPT_FLOAT(max_playback_rate), M_RANGE(0, FLT_MAX)},
+            {0}
+        }
+    },
+    .create = af_scaletempo2_create,
+};
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
new file mode 100644
index 0000000..534f4f6
--- /dev/null
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -0,0 +1,873 @@
+#include <float.h>
+#include <math.h>
+
+#include "audio/chmap.h"
+#include "audio/filter/af_scaletempo2_internals.h"
+
+#include "config.h"
+
+// Algorithm overview (from chromium):
+// Waveform Similarity Overlap-and-add (WSOLA).
+//
+// One WSOLA iteration
+//
+// 1) Extract |target_block| as input frames at indices
+//    [|target_block_index|, |target_block_index| + |ola_window_size|).
+//    Note that |target_block| is the "natural" continuation of the output.
+//
+// 2) Extract |search_block| as input frames at indices
+//    [|search_block_index|,
+//     |search_block_index| + |num_candidate_blocks| + |ola_window_size|).
+//
+// 3) Find a block within the |search_block| that is most similar
+//    to |target_block|. Let |optimal_index| be the index of such block and
+//    write it to |optimal_block|.
+//
+// 4) Update:
+//    |optimal_block| = |transition_window| * |target_block| +
+//    (1 - |transition_window|) * |optimal_block|.
+//
+// 5) Overlap-and-add |optimal_block| to the |wsola_output|.
+//
+// 6) Update: |target_block_index| = |optimal_index| + |ola_hop_size|.
+
+struct interval {
+    int lo; // first index inside the interval (inclusive)
+    int hi; // last index inside the interval (inclusive)
+};
+
+static bool in_interval(int n, struct interval q)
+{
+    return !(n < q.lo || n > q.hi);  // both bounds are inclusive
+}
+
+static float **realloc_2d(float **p, int x, int y)
+{
+    float **array = realloc(p, sizeof(float*) * x + sizeof(float) * x * y);
+    if (!array && x > 0)
+        abort(); // out of memory: there is no usable buffer to hand back
+    for (int i = 0; i < x; ++i)
+        array[i] = (float *) (array + x) + i * y; // rows follow the pointers
+    return array;
+}
+
+static void zero_2d(float **a, int x, int y)
+{
+    memset(&a[x], 0, sizeof(float) * x * y);  // assumes realloc_2d() layout
+}
+
+static void zero_2d_partial(float **a, int x, int y)
+{
+    // Clear only the first |y| floats of each of the |x| rows.
+    for (int row = 0; row < x; ++row)
+        memset(a[row], 0, sizeof(float) * y);
+}
+
+// Compute the energy (sum of squares) of every sliding block of each channel.
+// Interleaved output: block n of channel k is energy[k + n * channels].
+// There are |input_frames| - (|frames_per_block| - 1) blocks per channel, so
+// |energy| must hold at least that many * |channels| floats.
+static void multi_channel_moving_block_energies(
+    float **input, int input_frames, int channels,
+    int frames_per_block, float *energy)
+{
+    int num_blocks = input_frames - (frames_per_block - 1);
+
+    for (int k = 0; k < channels; ++k) {
+        const float* input_channel = input[k];
+
+        energy[k] = 0;
+
+        // First block of channel |k|: a plain sum of squares.
+        for (int m = 0; m < frames_per_block; ++m) {
+            energy[k] += input_channel[m] * input_channel[m];
+        }
+
+        const float* slide_out = input_channel; // sample leaving the block
+        const float* slide_in = input_channel + frames_per_block; // entering
+        for (int n = 1; n < num_blocks; ++n, ++slide_in, ++slide_out) {
+            energy[k + n * channels] = energy[k + (n - 1) * channels]
+                - *slide_out * *slide_out + *slide_in * *slide_in;
+        }
+    }
+}
+
+// Sum over all channels of dot(a, b) / sqrt(energy(a) * energy(b)).
+static float multi_channel_similarity_measure(
+    const float* dot_prod_a_b, const float* energy_a, const float* energy_b,
+    int channels)
+{
+    const float epsilon = 1e-12f;  // keeps the divisor non-zero for zero energy
+    float total = 0.0f;
+    for (int ch = 0; ch < channels; ++ch) {
+        float norm = sqrtf(energy_a[ch] * energy_b[ch] + epsilon);
+        total += dot_prod_a_b[ch] / norm;
+    }
+    return total;
+}
+
+#if HAVE_VECTOR
+
+typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
+
+// Per-channel dot product of two planar buffers |a| and |b|, each read from
+// its own frame offset. |dot_product[k]| is the dot-product of channel |k|.
+// The caller should allocate sufficient space for |dot_product|.
+static void multi_channel_dot_product(
+    float **a, int frame_offset_a,
+    float **b, int frame_offset_b,
+    int channels,
+    int num_frames, float *dot_product)
+{
+    assert(frame_offset_a >= 0);
+    assert(frame_offset_b >= 0);
+
+    for (int k = 0; k < channels; ++k) {
+        const float* ch_a = a[k] + frame_offset_a;
+        const float* ch_b = b[k] + frame_offset_b;
+        float sum = 0.0;
+        if (num_frames < 32)
+            goto rest;  // not even one full 32-float stripe
+
+        const v8sf *va = (const v8sf *) ch_a;
+        const v8sf *vb = (const v8sf *) ch_b;
+        v8sf vsum[4] = {
+            // Initialize to product of first 32 floats
+            va[0] * vb[0],
+            va[1] * vb[1],
+            va[2] * vb[2],
+            va[3] * vb[3],
+        };
+        va += 4;
+        vb += 4;
+
+        // Accumulate the remaining full 32-float stripes (the first is in vsum)
+        for (int n = 1; n < num_frames / 32; n++) {
+            vsum[0] += va[0] * vb[0];
+            vsum[1] += va[1] * vb[1];
+            vsum[2] += va[2] * vb[2];
+            vsum[3] += va[3] * vb[3];
+            va += 4;
+            vb += 4;
+        }
+
+        // Vertical sum across `vsum` entries
+        vsum[0] += vsum[1];
+        vsum[2] += vsum[3];
+        vsum[0] += vsum[2];
+
+        // Horizontal sum across `vsum[0]`, could probably be done better but
+        // this section is not super performance critical
+        float *vf = (float *) &vsum[0];
+        sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
+        ch_a = (const float *) va;
+        ch_b = (const float *) vb;
+
+rest:
+        // Process the remainder (the last num_frames % 32 frames) one by one
+        for (int n = 0; n < num_frames % 32; n++)
+            sum += *ch_a++ * *ch_b++;
+
+        dot_product[k] = sum;
+    }
+}
+
+#else // !HAVE_VECTOR
+
+static void multi_channel_dot_product(
+    float **a, int frame_offset_a,
+    float **b, int frame_offset_b,
+    int channels, int num_frames, float *dot_product)
+{
+    assert(frame_offset_a >= 0);
+    assert(frame_offset_b >= 0);
+
+    // Scalar fallback: one running sum per channel, frames taken in order.
+    for (int ch = 0; ch < channels; ++ch) {
+        const float *lhs = a[ch] + frame_offset_a;
+        const float *rhs = b[ch] + frame_offset_b;
+        float acc = 0.0;
+        for (int i = 0; i < num_frames; i++)
+            acc += lhs[i] * rhs[i];
+        dot_product[ch] = acc;
+    }
+}
+
+#endif // HAVE_VECTOR
+
+// Fit the parabola f(x) = a * x^2 + b * x + c through the three samples
+//   f(-1) = y_values[0]
+//   f(0)  = y_values[1]
+//   f(1)  = y_values[2]
+// and report its extremum (a maximum when y[0] <= y[1] >= y[2]).
+static void quadratic_interpolation(
+    const float* y_values, float* extremum, float* extremum_value)
+{
+    float a = 0.5f * (y_values[2] + y_values[0]) - y_values[1];
+    float b = 0.5f * (y_values[2] - y_values[0]);
+    float c = y_values[1];
+
+    if (a == 0.f) {
+        // Degenerate case: the points are colinear, so use the middle sample.
+        *extremum = 0;
+        *extremum_value = y_values[1];
+        return;
+    }
+    *extremum = -b / (2.f * a);
+    *extremum_value = a * (*extremum) * (*extremum) + b * (*extremum) + c;
+}
+
+// Search a subset of all candidate blocks. The search is performed every
+// |decimation| frames. This reduces complexity by a factor of about
+// 1 / |decimation|. A quadratic interpolation is used to have a better
+// estimate of the best match.
+static int decimated_search(
+    int decimation, struct interval exclude_interval,
+    float **target_block, int target_block_frames,
+    float **search_segment, int search_segment_frames,
+    int channels,
+    const float *energy_target_block, const float *energy_candidate_blocks)
+{
+    int num_candidate_blocks = search_segment_frames - (target_block_frames - 1);
+    float dot_prod [MP_NUM_CHANNELS];
+    float similarity[3];  // Three elements for quadratic interpolation.
+
+    int n = 0;
+    multi_channel_dot_product(
+        target_block, 0,
+        search_segment, n,
+        channels,
+        target_block_frames, dot_prod);
+    similarity[0] = multi_channel_similarity_measure(
+        dot_prod, energy_target_block,
+        &energy_candidate_blocks[n * channels], channels);
+
+    // Set the starting point as optimal point.
+    float best_similarity = similarity[0];
+    int optimal_index = 0;
+
+    n += decimation;
+    if (n >= num_candidate_blocks) {
+        return 0;  // only one candidate could be sampled
+    }
+
+    multi_channel_dot_product(
+        target_block, 0,
+        search_segment, n,
+        channels,
+        target_block_frames, dot_prod);
+    similarity[1] = multi_channel_similarity_measure(
+        dot_prod, energy_target_block,
+        &energy_candidate_blocks[n * channels], channels);
+
+    n += decimation;
+    if (n >= num_candidate_blocks) {
+        // We cannot do any more sampling. Compare these two values and return the
+        // optimal index.
+        return similarity[1] > similarity[0] ? decimation : 0;
+    }
+
+    for (; n < num_candidate_blocks; n += decimation) {
+        multi_channel_dot_product(
+            target_block, 0,
+            search_segment, n,
+            channels,
+            target_block_frames, dot_prod);
+
+        similarity[2] = multi_channel_similarity_measure(
+            dot_prod, energy_target_block,
+            &energy_candidate_blocks[n * channels], channels);
+
+        if ((similarity[1] > similarity[0] && similarity[1] >= similarity[2]) ||
+            (similarity[1] >= similarity[0] && similarity[1] > similarity[2]))
+        {
+            // A local maximum is found. Do a quadratic interpolation for a
+            // better estimate of candidate maximum.
+            float normalized_candidate_index;
+            float candidate_similarity;
+            quadratic_interpolation(similarity, &normalized_candidate_index,
+                                    &candidate_similarity);
+
+            int candidate_index = n - decimation
+                 + (int)(normalized_candidate_index * decimation +  0.5f);
+            if (candidate_similarity > best_similarity
+                && !in_interval(candidate_index, exclude_interval)) {
+                optimal_index = candidate_index;
+                best_similarity = candidate_similarity;
+            }
+        } else if (n + decimation >= num_candidate_blocks &&
+                   similarity[2] > best_similarity &&
+                   !in_interval(n, exclude_interval))
+        {
+            // If this is the end-point and has a better similarity-measure than
+            // optimal, then we accept it as optimal point.
+            optimal_index = n;
+            best_similarity = similarity[2];
+        }
+        memmove(similarity, &similarity[1], 2 * sizeof(*similarity));
+    }
+    return optimal_index;
+}
+
+// Exhaustively test every index in [|low_limit|, |high_limit|] of
+// |search_block| that is not inside |exclude_interval| and return the one most
+// similar to |target_block| (0 if every index is excluded). The energies are
+// those of |target_block| and of all candidate blocks within |search_block|.
+static int full_search(
+    int low_limit, int high_limit,
+    struct interval exclude_interval,
+    float **target_block, int target_block_frames,
+    float **search_block, int search_block_frames,
+    int channels,
+    const float* energy_target_block,
+    const float* energy_candidate_blocks)
+{
+    // One dot product per channel.
+    float dot_prod[MP_NUM_CHANNELS];
+
+    float best_similarity = -FLT_MAX;  // any real similarity beats this
+    int optimal_index = 0;
+
+    for (int n = low_limit; n <= high_limit; ++n) {
+        if (in_interval(n, exclude_interval)) {
+            continue;
+        }
+        multi_channel_dot_product(target_block, 0, search_block, n, channels,
+            target_block_frames, dot_prod);
+
+        float similarity = multi_channel_similarity_measure(
+            dot_prod, energy_target_block,
+            &energy_candidate_blocks[n * channels], channels);
+
+        if (similarity > best_similarity) {
+            best_similarity = similarity;
+            optimal_index = n;
+        }
+    }
+
+    return optimal_index;
+}
+
+// Find the index of the block, within |search_block|, that is most similar
+// to |target_block|. Obviously, the returned index is w.r.t. |search_block|.
+// |exclude_interval| is an interval that is excluded from the search.
+static int compute_optimal_index(
+    float **search_block, int search_block_frames,
+    float **target_block, int target_block_frames,
+    float *energy_candidate_blocks,
+    int channels,
+    struct interval exclude_interval)
+{
+    int num_candidate_blocks = search_block_frames - (target_block_frames - 1);
+
+    // This is a compromise between complexity reduction and search accuracy. I
+    // don't have a proof that down sample of order 5 is optimal.
+    // One can compute a decimation factor that minimizes complexity given
+    // the size of |search_block| and |target_block|. However, my experiments
+    // show the rate of missing the optimal index is significant.
+    // This value is chosen heuristically based on experiments.
+    const int search_decimation = 5;
+
+    float energy_target_block [MP_NUM_CHANNELS];
+    // energy_candidate_blocks must have at least size
+    // sizeof(float) * channels * num_candidate_blocks
+
+    // Energy of all candidate blocks.
+    multi_channel_moving_block_energies(
+        search_block,
+        search_block_frames,
+        channels,
+        target_block_frames,
+        energy_candidate_blocks);
+
+    // Energy of target block (its dot product with itself).
+    multi_channel_dot_product(
+        target_block, 0,
+        target_block, 0,
+        channels,
+        target_block_frames, energy_target_block);
+
+    int optimal_index = decimated_search(
+        search_decimation, exclude_interval,
+        target_block, target_block_frames,
+        search_block, search_block_frames,
+        channels,
+        energy_target_block,
+        energy_candidate_blocks);
+
+    int lim_low = MPMAX(0, optimal_index - search_decimation); // refine nearby
+    int lim_high = MPMIN(num_candidate_blocks - 1,
+                            optimal_index + search_decimation);
+    return full_search(
+        lim_low, lim_high, exclude_interval,
+        target_block, target_block_frames,
+        search_block, search_block_frames,
+        channels,
+        energy_target_block, energy_candidate_blocks);
+}
+
+static void peek_buffer(struct mp_scaletempo2 *p,
+    int frames, int read_offset, int write_offset, float **dest)
+{
+    assert(p->input_buffer_frames >= frames);
+    // Copy without consuming: the input buffer itself is left untouched.
+    for (int ch = 0; ch < p->channels; ++ch) {
+        const float *src = p->input_buffer[ch] + read_offset;
+        memcpy(dest[ch] + write_offset, src, frames * sizeof(float));
+    }
+}
+
+static void seek_buffer(struct mp_scaletempo2 *p, int frames)
+{
+    assert(p->input_buffer_frames >= frames);
+    // Drop |frames| from the front; a positive final count shrinks, min. 0.
+    p->input_buffer_frames -= frames;
+    if (p->input_buffer_final_frames > 0)
+        p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames);
+    for (int ch = 0; ch < p->channels; ++ch) {
+        float *row = p->input_buffer[ch];
+        memmove(row, row + frames, p->input_buffer_frames * sizeof(float));
+    }
+}
+
+static int write_completed_frames_to(struct mp_scaletempo2 *p,
+    int requested_frames, int dest_offset, float **dest)
+{
+    // Never hand out more than |num_complete_frames| frames.
+    int count = MPMIN(p->num_complete_frames, requested_frames);
+    if (count == 0)
+        return 0;  // There is nothing to read from |wsola_output|, return.
+
+    // Frames that stay in |wsola_output| once |count| frames are taken out.
+    int tail = p->wsola_output_size - count;
+
+    for (int ch = 0; ch < p->channels; ++ch) {
+        float *out = p->wsola_output[ch];
+        // Copy to the caller first, then shift the remainder to the front.
+        memcpy(dest[ch] + dest_offset, out, count * sizeof(float));
+        memmove(out, out + count, sizeof(*out) * tail);
+    }
+
+    // The frames handed out are no longer pending.
+    p->num_complete_frames -= count;
+    return count;
+}
+
+// Value |output_time| takes after advancing one hop at |playback_rate|.
+static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate)
+{
+    return p->ola_hop_size * playback_rate + p->output_time;
+}
+
+// Search block start for |output_time|: subtract the center offset and round.
+static int get_search_block_index(struct mp_scaletempo2 *p, double output_time)
+{
+    return (int)(output_time - p->search_block_center_offset + 0.5);
+}
+
+// Input frames still missing before the next WSOLA iteration can run (>= 0).
+static int frames_needed(struct mp_scaletempo2 *p, double playback_rate)
+{
+    int next_search_index =
+        get_search_block_index(p, get_updated_time(p, playback_rate));
+    int target_end = p->target_block_index + p->ola_window_size;
+    int search_end = next_search_index + p->search_block_size;
+    return MPMAX(0, MPMAX(target_end, search_end) - p->input_buffer_frames);
+}
+
+static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate)
+{
+    return !(frames_needed(p, playback_rate) > 0);  // no input is missing
+}
+
+// Resize |input_buffer| to |size| frames per channel; growing keeps its frames.
+static void resize_input_buffer(struct mp_scaletempo2 *p, int size)
+{
+    int old_size = p->input_buffer_size;
+    p->input_buffer_size = size;
+    p->input_buffer = realloc_2d(p->input_buffer, p->channels, size);
+    // realloc_2d() re-spaces the rows but leaves the samples at the old
+    // stride; move buffered frames of channels > 0, last row first.
+    for (int i = p->channels - 1; i > 0 && size > old_size; --i)
+        memmove(p->input_buffer[i], p->input_buffer[0] + i * old_size,
+            p->input_buffer_frames * sizeof(float));
+}
+
+// pad end with silence until a wsola iteration can be performed
+static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate)
+{
+    int needed = frames_needed(p, playback_rate);
+    if (needed <= 0)
+        return; // no silence needed for iteration
+
+    if (needed + p->input_buffer_frames > p->input_buffer_size)
+        resize_input_buffer(p, needed + p->input_buffer_frames);
+    for (int i = 0; i < p->channels; ++i)
+        memset(p->input_buffer[i] + p->input_buffer_frames, 0, needed * sizeof(float));
+    p->input_buffer_added_silence += needed;
+    p->input_buffer_frames += needed;
+}
+
+void mp_scaletempo2_set_final(struct mp_scaletempo2 *p)
+{
+    if (p->input_buffer_final_frames > 0)
+        return;  // a positive count was already recorded; keep it
+    p->input_buffer_final_frames = p->input_buffer_frames;
+}
+
+int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
+    uint8_t **planes, int frame_size, double playback_rate)
+{
+    int needed = frames_needed(p, playback_rate);
+    int accepted = MPMIN(needed, frame_size);
+    if (accepted == 0)
+        return 0;  // nothing is wanted or nothing was offered
+
+    int write_pos = p->input_buffer_frames;
+    if (write_pos + accepted > p->input_buffer_size)
+        resize_input_buffer(p, write_pos + accepted);
+
+    // Append after the frames that are already buffered.
+    for (int ch = 0; ch < p->channels; ++ch) {
+        float *dst = p->input_buffer[ch] + write_pos;
+        memcpy(dst, planes[ch], accepted * sizeof(float));
+    }
+    p->input_buffer_frames = write_pos + accepted;
+    return accepted;
+}
+
+static bool target_is_within_search_region(struct mp_scaletempo2 *p)
+{
+    int search_end = p->search_block_index + p->search_block_size;
+    return p->target_block_index >= p->search_block_index
+        && p->target_block_index + p->ola_window_size <= search_end;
+}
+
+
+static void peek_audio_with_zero_prepend(struct mp_scaletempo2 *p,
+    int read_offset_frames, float **dest, int dest_frames)
+{
+    assert(read_offset_frames + dest_frames <= p->input_buffer_frames);
+
+    // A negative offset asks for frames before the start of the input buffer;
+    // that part of |dest| is filled with zeros instead.
+    int zeros = 0;
+    if (read_offset_frames < 0) {
+        zeros = MPMIN(-read_offset_frames, dest_frames);
+        read_offset_frames = 0;
+        zero_2d_partial(dest, p->channels, zeros);
+    }
+
+    // Whatever is left of |dest| comes from the input buffer.
+    peek_buffer(p, dest_frames - zeros, read_offset_frames, zeros, dest);
+}
+
+static void get_optimal_block(struct mp_scaletempo2 *p)
+{
+    int optimal_index = 0;
+
+    // An interval around last optimal block which is excluded from the search.
+    // This is to reduce the buzzy sound. The number 160 is rather arbitrary and
+    // derived heuristically.
+    const int exclude_interval_length_frames = 160;
+    if (target_is_within_search_region(p)) { // target itself is a candidate
+        optimal_index = p->target_block_index;
+        peek_audio_with_zero_prepend(p,
+            optimal_index, p->optimal_block, p->ola_window_size);
+    } else {
+        peek_audio_with_zero_prepend(p,
+            p->target_block_index, p->target_block, p->ola_window_size);
+        peek_audio_with_zero_prepend(p,
+            p->search_block_index, p->search_block, p->search_block_size);
+        int last_optimal = p->target_block_index
+            - p->ola_hop_size - p->search_block_index;
+        struct interval exclude_iterval = {
+            .lo = last_optimal - exclude_interval_length_frames / 2,
+            .hi = last_optimal + exclude_interval_length_frames / 2
+        };
+
+        // |optimal_index| is in frames and it is relative to the beginning of the
+        // |search_block|.
+        optimal_index = compute_optimal_index(
+            p->search_block, p->search_block_size,
+            p->target_block, p->ola_window_size,
+            p->energy_candidate_blocks,
+            p->channels,
+            exclude_iterval);
+
+        // Translate |optimal_index| w.r.t. the beginning of |input_buffer| and
+        // extract the optimal block.
+        optimal_index += p->search_block_index;
+        peek_audio_with_zero_prepend(p,
+            optimal_index, p->optimal_block, p->ola_window_size);
+
+        // Make a transition from target block to the optimal block if different.
+        // Target block has the best continuation to the current output.
+        // Optimal block is the most similar block to the target, however, it might
+        // introduce some discontinuity when over-lap-added. Therefore, we combine
+        // them for a smoother transition. The length of transition window is twice
+        // as that of the optimal-block which makes it like a weighting function
+        // where target-block has higher weight close to zero (weight of 1 at index
+        // 0) and lower weight close to the end.
+        for (int k = 0; k < p->channels; ++k) {
+            float* ch_opt = p->optimal_block[k];
+            float* ch_target = p->target_block[k];
+            for (int n = 0; n < p->ola_window_size; ++n) {
+                ch_opt[n] = ch_opt[n] * p->transition_window[n]
+                    + ch_target[n] * p->transition_window[p->ola_window_size + n];
+            }
+        }
+    }
+
+    // Next target is one hop ahead of the current optimal.
+    p->target_block_index = optimal_index + p->ola_hop_size;
+}
+
+static void set_output_time(struct mp_scaletempo2 *p, double output_time)
+{
+    p->search_block_index = get_search_block_index(p, output_time);
+    p->output_time = output_time;  // index above is derived from this time
+}
+
+static void remove_old_input_frames(struct mp_scaletempo2 *p)
+{
+    // Frames before the earlier of the two block starts are treated as unused.
+    int drop = MPMIN(p->target_block_index, p->search_block_index);
+    if (drop <= 0)
+        return;  // Nothing to remove.
+
+    // Discard those frames and rebase every index/time onto the new start.
+    seek_buffer(p, drop);
+    p->search_block_index -= drop;
+    p->target_block_index -= drop;
+    p->output_time -= drop;
+}
+
+static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate)
+{
+    if (!can_perform_wsola(p, playback_rate)) {
+        return false;  // not enough input buffered yet
+    }
+
+    set_output_time(p, get_updated_time(p, playback_rate));
+    remove_old_input_frames(p);
+
+    assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames);
+
+    get_optimal_block(p);
+
+    // Overlap-and-add the optimal block onto the pending output.
+    for (int k = 0; k < p->channels; ++k) {
+        float* ch_opt_frame = p->optimal_block[k];
+        float* ch_output = p->wsola_output[k] + p->num_complete_frames;
+        if (p->wsola_output_started) {
+            for (int n = 0; n < p->ola_hop_size; ++n) {
+                ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
+                    ch_opt_frame[n] * p->ola_window[n];
+            }
+
+            // Copy the second half to the output.
+            memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
+                   sizeof(*ch_opt_frame) * p->ola_hop_size);
+        } else {
+            // No overlap for the first iteration.
+            memcpy(ch_output, ch_opt_frame,
+                   sizeof(*ch_opt_frame) * p->ola_window_size);
+        }
+    }
+
+    p->num_complete_frames += p->ola_hop_size; // 2nd half still awaits overlap
+    p->wsola_output_started = true;
+    return true;
+}
+
+static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest)
+{
+    // Frames buffered from the target block start on, capped by |dest_size|.
+    int count = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index);
+    if (count <= 0)
+        return 0; // There is nothing to read from input buffer; return.
+
+    peek_buffer(p, count, p->target_block_index, 0, dest);
+    seek_buffer(p, count);
+    return count;
+}
+
+int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
+    float **dest, int dest_size, double playback_rate)
+{
+    if (playback_rate == 0) return 0;
+
+    if (p->input_buffer_final_frames > 0) {
+        add_input_buffer_final_silence(p, playback_rate);
+    }
+
+    // Rates outside [min_playback_rate, max_playback_rate] are muted: issue a
+    // single clear instead of crossfading and clearing each crossfaded frame.
+    if (playback_rate < p->opts->min_playback_rate
+        || (playback_rate > p->opts->max_playback_rate
+            && p->opts->max_playback_rate > 0))
+    {
+        int frames_to_render = MPMIN(dest_size,
+            (int) (p->input_buffer_frames / playback_rate));
+
+        // Compute accurate number of frames to actually skip in the source data.
+        // Includes the leftover partial frame from last request. However, we can
+        // only skip over complete frames, so a partial frame may remain for next
+        // time.
+        p->muted_partial_frame += frames_to_render * playback_rate;
+        int seek_frames = (int) (p->muted_partial_frame);
+        zero_2d_partial(dest, p->channels, frames_to_render);
+        seek_buffer(p, seek_frames);
+
+        // Determine the partial frame that remains to be skipped for next call. If
+        // the user switches back to playing, it may be off time by this partial
+        // frame, which would be undetectable. If they subsequently switch to
+        // another playback rate that mutes, the code will attempt to line up the
+        // frames again.
+        p->muted_partial_frame -= seek_frames;
+        return frames_to_render;
+    }
+
+    int slower_step = (int) ceilf(p->ola_window_size * playback_rate);
+    int faster_step = (int) ceilf(p->ola_window_size / playback_rate);
+
+    // Optimize the most common |playback_rate| ~= 1 case to use a single copy
+    // instead of copying frame by frame.
+    if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) {
+
+        if (p->wsola_output_started) {
+            p->wsola_output_started = false;
+
+            // sync audio precisely again
+            set_output_time(p, p->target_block_index);
+            remove_old_input_frames(p);
+        }
+
+        return read_input_buffer(p, dest_size, dest); // plain copy, no WSOLA
+    }
+
+    int rendered_frames = 0;
+    do {
+        rendered_frames += write_completed_frames_to(p,
+            dest_size - rendered_frames, rendered_frames, dest);
+    } while (rendered_frames < dest_size
+             && run_one_wsola_iteration(p, playback_rate));
+    return rendered_frames;
+}
+
+double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate)
+{
+    return p->input_buffer_frames - p->output_time // input beyond output_time
+        - p->input_buffer_added_silence            // padding is not real input
+        + p->num_complete_frames * playback_rate;  // unread output, rate-scaled
+}
+
+bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
+{
+    if (p->input_buffer_final_frames > p->target_block_index)
+        return true;  // final input remains past the target block start
+    return can_perform_wsola(p, playback_rate) || p->num_complete_frames > 0;
+}
+
+void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
+{
+    free(p->energy_candidate_blocks);  // note: fields are not reset to NULL
+    free(p->input_buffer);
+    free(p->target_block);
+    free(p->search_block);
+    free(p->optimal_block);
+    free(p->wsola_output);
+    free(p->transition_window);
+    free(p->ola_window);
+}
+
+void mp_scaletempo2_reset(struct mp_scaletempo2 *p)
+{
+    // Silence the overlap-add buffer, then drop all input and positions.
+    zero_2d(p->wsola_output, p->channels, p->wsola_output_size);
+    p->wsola_output_started = false;
+    p->num_complete_frames = 0;
+    p->input_buffer_added_silence = 0;
+    p->input_buffer_final_frames = 0;
+    p->input_buffer_frames = 0;
+    p->target_block_index = 0;
+    p->search_block_index = 0;
+    p->output_time = 0.0;
+}
+
+// Fill |window| with a "periodic" Hann window: the first L samples of an L+1
+// Hann window, which gives perfect reconstruction for overlap-and-add.
+static void get_symmetric_hanning_window(int window_length, float* window)
+{
+    const float step = 2.0f * M_PI / window_length;
+    for (int i = 0; i < window_length; ++i)
+        window[i] = 0.5f * (1.0f - cosf(i * step));
+}
+
+
+void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
+{
+    p->muted_partial_frame = 0;
+    p->output_time = 0;
+    p->search_block_index = 0;
+    p->target_block_index = 0;
+    p->num_complete_frames = 0;
+    p->wsola_output_started = false;
+    p->channels = channels;
+
+    p->samples_per_second = rate;
+    p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms
+        * p->samples_per_second / 1000);
+    p->ola_window_size = (int)(p->opts->ola_window_size_ms
+        * p->samples_per_second / 1000);
+    // Make sure the window size is an even number.
+    p->ola_window_size += p->ola_window_size & 1;
+    p->ola_hop_size = p->ola_window_size / 2; // 50% overlap
+    // |num_candidate_blocks| / 2 is the offset of the center of the search
+    // block to the center of the first (left most) candidate block. The offset
+    // of the center of a candidate block to its left most point is
+    // |ola_window_size| / 2 - 1. Note that |ola_window_size| is even and in
+    // our convention the center belongs to the left half, so we need to subtract
+    // one frame to get the correct offset.
+    //
+    //                             Search Block
+    //              <------------------------------------------->
+    //
+    //   |ola_window_size| / 2 - 1
+    //              <----
+    //
+    //             |num_candidate_blocks| / 2
+    //                   <----------------
+    //                                 center
+    //              X----X----------------X---------------X-----X
+    //              <---------->                     <---------->
+    //                Candidate      ...               Candidate
+    //                   1,          ...         |num_candidate_blocks|
+    p->search_block_center_offset = p->num_candidate_blocks / 2
+        + (p->ola_window_size / 2 - 1);
+    p->ola_window = realloc(p->ola_window, sizeof(float) * p->ola_window_size);
+    get_symmetric_hanning_window(p->ola_window_size, p->ola_window);
+    p->transition_window = realloc(p->transition_window,
+        sizeof(float) * p->ola_window_size * 2);
+    get_symmetric_hanning_window(2 * p->ola_window_size, p->transition_window);
+
+    p->wsola_output_size = p->ola_window_size + p->ola_hop_size;
+    p->wsola_output = realloc_2d(p->wsola_output, p->channels, p->wsola_output_size);
+    // Initialize for overlap-and-add of the first block.
+    zero_2d(p->wsola_output, p->channels, p->wsola_output_size);
+
+    // Auxiliary containers.
+    p->optimal_block = realloc_2d(p->optimal_block, p->channels, p->ola_window_size);
+    p->search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1);
+    p->search_block = realloc_2d(p->search_block, p->channels, p->search_block_size);
+    p->target_block = realloc_2d(p->target_block, p->channels, p->ola_window_size);
+
+    resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size));
+    p->input_buffer_frames = 0;
+    p->input_buffer_final_frames = 0;
+    p->input_buffer_added_silence = 0;
+
+    // One energy value per channel for every candidate block.
+    p->energy_candidate_blocks = realloc(p->energy_candidate_blocks,
+        sizeof(float) * p->channels * p->num_candidate_blocks);
+}
diff --git a/audio/filter/af_scaletempo2_internals.h b/audio/filter/af_scaletempo2_internals.h
new file mode 100644
index 0000000..6c3c94c
--- /dev/null
+++ b/audio/filter/af_scaletempo2_internals.h
@@ -0,0 +1,134 @@
+// This filter was ported from Chromium
+// (https://chromium.googlesource.com/chromium/chromium/+/51ed77e3f37a9a9b80d6d0a8259e84a8ca635259/media/filters/audio_renderer_algorithm.cc)
+//
+// Copyright 2015 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "common/common.h"
+
+struct mp_scaletempo2_opts { // Tuning parameters referenced via |opts| below.
+    // Min/max supported playback rates for slow/fast audio. Audio whose rate
+    // falls outside of this range is muted.
+    // Audio at these speeds would sound better under a frequency domain algorithm.
+    float min_playback_rate;
+    float max_playback_rate;
+    // Overlap-and-add (OLA) window size in milliseconds.
+    float ola_window_size_ms;
+    // Size of the WSOLA search interval in milliseconds. The search interval
+    // is [-delta, delta] around |output_index| * |playback_rate|, so its
+    // total size is 2 * delta.
+    float wsola_search_interval_ms;
+};
+
+struct mp_scaletempo2 { // Working state of the WSOLA algorithm.
+    struct mp_scaletempo2_opts *opts;
+    // Number of channels in audio stream.
+    int channels;
+    // Sample rate of audio stream.
+    int samples_per_second;
+    // If muted, keep track of partial frames that should have been skipped over.
+    double muted_partial_frame;
+    // Bookkeeping of the current time of generated audio, in frames.
+    // Corresponds to the center of |search_block|. This is increased in
+    // intervals of |ola_hop_size| multiplied by the current playback_rate,
+    // for every WSOLA iteration. This tracks the number of advanced frames as
+    // a double to achieve accurate playback rates beyond the integer precision
+    // of |search_block_index|.
+    // Needs to be adjusted like any other index when frames are evicted from
+    // |input_buffer|.
+    double output_time;
+    // The offset of the center frame of |search_block| w.r.t. its first frame.
+    int search_block_center_offset;
+    // Index of the beginning of the |search_block|, in frames. This may be
+    // negative, which is handled by |peek_audio_with_zero_prepend|.
+    int search_block_index;
+    // Number of candidate blocks to search to find the one most similar to the
+    // target block.
+    int num_candidate_blocks;
+    // Index of the beginning of the target block, counted in frames.
+    int target_block_index;
+    // Overlap-and-add window size in frames.
+    int ola_window_size;
+    // The hop size of overlap-and-add in frames. This implementation assumes 50%
+    // overlap-and-add.
+    int ola_hop_size;
+    // Number of frames in |wsola_output| for which overlap-and-add is complete
+    // and which can be copied to the output when fill_buffer() is called. It
+    // also specifies the index where the next WSOLA window is overlap-and-added.
+    int num_complete_frames;
+    // Whether |wsola_output| contains an additional |ola_hop_size| of overlap
+    // frames for the next iteration.
+    bool wsola_output_started;
+    // Overlap-and-add window.
+    float *ola_window;
+    // Transition window, used to update |optimal_block| by a weighted sum of
+    // |optimal_block| and |target_block|.
+    float *transition_window;
+    // This stores a part of the output that is created but couldn't be rendered.
+    // Output is generated frame-by-frame, which at some point might exceed the
+    // number of requested samples. Furthermore, due to overlap-and-add,
+    // the last half-window of the output is incomplete, which is stored in this
+    // buffer.
+    float **wsola_output;
+    int wsola_output_size; // |ola_window_size| + |ola_hop_size|
+    // Auxiliary variables to avoid allocation in every iteration.
+    // Stores the optimal block in every iteration. This is the most
+    // similar block to |target_block| within |search_block| and it is
+    // overlap-and-added to |wsola_output|.
+    float **optimal_block;
+    // A block of data that search is performed over to find the |optimal_block|.
+    float **search_block;
+    int search_block_size; // |num_candidate_blocks| + |ola_window_size| - 1
+    // Stores the target block. |search_block| is searched for the block
+    // (|optimal_block|) that is most similar to |target_block|. Allocated
+    // with dimensions |channels| x |ola_window_size|.
+    float **target_block;
+    // Buffered audio data.
+    float **input_buffer;
+    int input_buffer_size; // NOTE(review): presumably capacity in frames - confirm
+    int input_buffer_frames; // NOTE(review): presumably frames in use - confirm
+    // How many frames in |input_buffer| need to be flushed by padding with
+    // silence to process the final packet. While this is nonzero, the filter
+    // appends silence to |input_buffer| until these frames are processed.
+    int input_buffer_final_frames;
+    // How many additional frames of silence have been added to |input_buffer|
+    // for padding after the final packet.
+    int input_buffer_added_silence;
+    float *energy_candidate_blocks; // |channels| * |num_candidate_blocks| floats
+};
+// Algorithm entry points; each one operates on the caller-supplied state |p|.
+void mp_scaletempo2_destroy(struct mp_scaletempo2 *p);
+void mp_scaletempo2_reset(struct mp_scaletempo2 *p);
+void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate);
+double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate);
+int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
+    uint8_t **planes, int frame_size, double playback_rate);
+void mp_scaletempo2_set_final(struct mp_scaletempo2 *p);
+int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
+    float **dest, int dest_size, double playback_rate);
+bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate);