author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit     2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree       b80bf8bf13c3766139fbacc530efd0dd9d54394c /media/ffvpx/libavcodec/aarch64
parent     Initial commit. (diff)
download   firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz
           firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1. (upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/ffvpx/libavcodec/aarch64')
-rw-r--r--  media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c | 50
-rw-r--r--  media/ffvpx/libavcodec/aarch64/fft_neon.S | 442
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c | 59
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264cmc_neon.S | 450
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c | 102
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264dsp_neon.S | 498
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264idct_neon.S | 409
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264pred_init.c | 93
-rw-r--r--  media/ffvpx/libavcodec/aarch64/h264pred_neon.S | 361
-rw-r--r--  media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c | 123
-rw-r--r--  media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S | 397
-rw-r--r--  media/ffvpx/libavcodec/aarch64/idct.h | 28
-rw-r--r--  media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c | 41
-rw-r--r--  media/ffvpx/libavcodec/aarch64/mdct_neon.S | 323
-rw-r--r--  media/ffvpx/libavcodec/aarch64/moz.build | 50
-rw-r--r--  media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c | 40
-rw-r--r--  media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S | 225
-rw-r--r--  media/ffvpx/libavcodec/aarch64/neon.S | 149
-rw-r--r--  media/ffvpx/libavcodec/aarch64/simple_idct_neon.S | 362
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c | 47
-rw-r--r--  media/ffvpx/libavcodec/aarch64/videodsp.S | 28
-rw-r--r--  media/ffvpx/libavcodec/aarch64/videodsp_init.c | 32
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp8dsp.h | 75
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c | 124
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S | 1790
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9dsp_init.h | 29
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c | 23
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c | 23
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c | 273
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c | 258
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 2017
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S | 1580
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S | 873
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S | 1334
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S | 631
-rw-r--r--  media/ffvpx/libavcodec/aarch64/vp9mc_neon.S | 687
36 files changed, 14026 insertions, 0 deletions
diff --git a/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
new file mode 100644
index 0000000000..db285205ab
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+
+#include "libavcodec/fft.h"
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+av_cold void ff_fft_init_aarch64(FFTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->fft_permute = ff_fft_permute_neon;
+ s->fft_calc = ff_fft_calc_neon;
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
+ s->mdct_calc = ff_mdct_calc_neon;
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
+ }
+}
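
The init function above follows FFmpeg's usual runtime-dispatch pattern: query the CPU flags once, then overwrite the generic function pointers with the NEON variants so that later calls go through s->fft_calc and friends with no further checks. A minimal, self-contained C sketch of that pattern (illustrative only, not part of this patch; all names are hypothetical):

/* Hypothetical sketch of runtime CPU dispatch; not FFmpeg API. */
#include <stdio.h>

typedef struct MyContext {
    void (*calc)(float *data, int n);     /* filled in once at init time */
} MyContext;

static void calc_c(float *data, int n)    { (void)data; (void)n; puts("generic C path"); }
static void calc_neon(float *data, int n) { (void)data; (void)n; puts("NEON path"); }

static int have_neon_flag(void)
{
#if defined(__aarch64__)
    return 1;                             /* ASIMD is mandatory on AArch64 */
#else
    return 0;
#endif
}

static void my_init(MyContext *s)
{
    s->calc = calc_c;                     /* generic fallback first */
    if (have_neon_flag())
        s->calc = calc_neon;              /* swap in the optimised version */
}

int main(void)
{
    MyContext ctx;
    float buf[8] = {0};
    my_init(&ctx);
    ctx.calc(buf, 8);                     /* callers never re-check CPU flags */
    return 0;
}

The same structure repeats in every *_init_aarch64.c file added by this patch.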
diff --git a/media/ffvpx/libavcodec/aarch64/fft_neon.S b/media/ffvpx/libavcodec/aarch64/fft_neon.S
new file mode 100644
index 0000000000..862039f97d
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/fft_neon.S
@@ -0,0 +1,442 @@
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+.macro transpose d0, d1, s0, s1
+ trn1 \d0, \s0, \s1
+ trn2 \d1, \s0, \s1
+.endm
+
+
+function fft4_neon
+ ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+ fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
+ fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
+
+ ext v16.8b, v2.8b, v3.8b, #4
+ ext v17.8b, v3.8b, v2.8b, #4
+
+ fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3
+ fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3
+
+ fadd v0.2s, v4.2s, v5.2s
+ fsub v2.2s, v4.2s, v5.2s
+ fadd v1.2s, v6.2s, v7.2s
+ fsub v3.2s, v6.2s, v7.2s
+
+ st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+ ret
+endfunc
+
+function fft8_neon
+ mov x1, x0
+ ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
+ ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+ ext v22.8b, v2.8b, v3.8b, #4
+ ext v23.8b, v3.8b, v2.8b, #4
+ fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
+ fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
+ fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
+ fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
+ rev64 v27.2s, v28.2s // ???
+ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
+ fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
+ fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
+ ext v6.8b, v4.8b, v5.8b, #4
+ ext v7.8b, v5.8b, v4.8b, #4
+ fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
+ fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
+ fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
+ fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
+ fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
+ fadd v0.2s, v20.2s, v21.2s
+ fsub v2.2s, v20.2s, v21.2s
+ fadd v1.2s, v22.2s, v23.2s
+ rev64 v26.2s, v26.2s
+ rev64 v27.2s, v27.2s
+ fsub v3.2s, v22.2s, v23.2s
+ fsub v6.2s, v6.2s, v7.2s
+ fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
+ fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
+ fadd v7.2s, v4.2s, v5.2s
+ fsub v18.2s, v2.2s, v6.2s
+ ext v26.8b, v24.8b, v25.8b, #4
+ ext v27.8b, v25.8b, v24.8b, #4
+ fadd v2.2s, v2.2s, v6.2s
+ fsub v16.2s, v0.2s, v7.2s
+ fadd v5.2s, v25.2s, v24.2s
+ fsub v4.2s, v26.2s, v27.2s
+ fadd v0.2s, v0.2s, v7.2s
+ fsub v17.2s, v1.2s, v5.2s
+ fsub v19.2s, v3.2s, v4.2s
+ fadd v3.2s, v3.2s, v4.2s
+ fadd v1.2s, v1.2s, v5.2s
+
+ st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]
+
+ ret
+endfunc
+
+function fft16_neon
+ mov x1, x0
+ ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
+ ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
+ ext v22.8b, v2.8b, v3.8b, #4
+ ext v23.8b, v3.8b, v2.8b, #4
+ fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
+ fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
+ fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
+ fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
+ rev64 v27.2s, v28.2s // ???
+ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
+ fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
+ fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
+ ext v6.8b, v4.8b, v5.8b, #4
+ ext v7.8b, v5.8b, v4.8b, #4
+ fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
+ fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
+ fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
+ fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
+ fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
+ fadd v0.2s, v20.2s, v21.2s
+ fsub v2.2s, v20.2s, v21.2s
+ fadd v1.2s, v22.2s, v23.2s
+ rev64 v26.2s, v26.2s
+ rev64 v27.2s, v27.2s
+ fsub v3.2s, v22.2s, v23.2s
+ fsub v6.2s, v6.2s, v7.2s
+ fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
+ fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
+ fadd v7.2s, v4.2s, v5.2s
+ fsub v18.2s, v2.2s, v6.2s
+ ld1 {v20.4s,v21.4s}, [x0], #32
+ ld1 {v22.4s,v23.4s}, [x0], #32
+ ext v26.8b, v24.8b, v25.8b, #4
+ ext v27.8b, v25.8b, v24.8b, #4
+ fadd v2.2s, v2.2s, v6.2s
+ fsub v16.2s, v0.2s, v7.2s
+ fadd v5.2s, v25.2s, v24.2s
+ fsub v4.2s, v26.2s, v27.2s
+ transpose v24.2d, v25.2d, v20.2d, v22.2d
+ transpose v26.2d, v27.2d, v21.2d, v23.2d
+ fadd v0.2s, v0.2s, v7.2s
+ fsub v17.2s, v1.2s, v5.2s
+ fsub v19.2s, v3.2s, v4.2s
+ fadd v3.2s, v3.2s, v4.2s
+ fadd v1.2s, v1.2s, v5.2s
+ ext v20.16b, v21.16b, v21.16b, #4
+ ext v21.16b, v23.16b, v23.16b, #4
+
+ zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
+ zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
+ zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
+ zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
+
+ // 2 x fft4
+ transpose v22.2d, v23.2d, v20.2d, v21.2d
+
+ fadd v4.4s, v24.4s, v25.4s
+ fadd v5.4s, v26.4s, v27.4s
+ fsub v6.4s, v24.4s, v25.4s
+ fsub v7.4s, v22.4s, v23.4s
+
+ ld1 {v23.4s}, [x14]
+
+ fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
+ fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
+ fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
+ fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
+
+ //fft_pass_neon_16
+ rev64 v7.4s, v25.4s
+ fmul v25.4s, v25.4s, v23.s[1]
+ fmul v7.4s, v7.4s, v29.4s
+ fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
+
+ zip1 v20.4s, v24.4s, v25.4s
+ zip2 v21.4s, v24.4s, v25.4s
+ fneg v22.4s, v20.4s
+ fadd v4.4s, v21.4s, v20.4s
+ fsub v6.4s, v20.4s, v21.4s // just the second half
+ fadd v5.4s, v21.4s, v22.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
+ fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
+ fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
+ fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
+
+//second half
+ rev64 v6.4s, v26.4s
+ fmul v26.4s, v26.4s, v23.s[2]
+ rev64 v7.4s, v27.4s
+ fmul v27.4s, v27.4s, v23.s[3]
+ fmul v6.4s, v6.4s, v29.4s
+ fmul v7.4s, v7.4s, v29.4s
+ fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
+ fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
+
+ zip1 v24.4s, v26.4s, v27.4s
+ zip2 v25.4s, v26.4s, v27.4s
+ fneg v26.4s, v24.4s
+ fadd v4.4s, v25.4s, v24.4s
+ fsub v6.4s, v24.4s, v25.4s // just the second half
+ fadd v5.4s, v25.4s, v26.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
+ fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
+ fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
+ fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}
+
+ st1 {v16.4s,v17.4s}, [x1], #32
+ st1 {v18.4s,v19.4s}, [x1], #32
+ st1 {v20.4s,v21.4s}, [x1], #32
+ st1 {v22.4s,v23.4s}, [x1], #32
+
+ ret
+endfunc
+
+
+const trans4_float, align=4
+ .byte 0, 1, 2, 3
+ .byte 8, 9, 10, 11
+ .byte 4, 5, 6, 7
+ .byte 12, 13, 14, 15
+endconst
+
+const trans8_float, align=4
+ .byte 24, 25, 26, 27
+ .byte 0, 1, 2, 3
+ .byte 28, 29, 30, 31
+ .byte 4, 5, 6, 7
+endconst
+
+function fft_pass_neon
+ sub x6, x2, #1 // n - 1, loop counter
+ lsl x5, x2, #3 // 2 * n * sizeof FFTSample
+ lsl x1, x2, #4 // 2 * n * sizeof FFTComplex
+ add x5, x4, x5 // wim
+ add x3, x1, x2, lsl #5 // 4 * n * sizeof FFTComplex
+ add x2, x0, x2, lsl #5 // &z[o2]
+ add x3, x0, x3 // &z[o3]
+ add x1, x0, x1 // &z[o1]
+ ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
+ ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
+ ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
+ trn2 v25.2d, v20.2d, v22.2d
+ sub x5, x5, #4 // wim--
+ trn1 v24.2d, v20.2d, v22.2d
+ ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1]
+ rev64 v7.4s, v25.4s
+ fmul v25.4s, v25.4s, v4.s[1]
+ ld1 {v16.4s}, [x0] // {z[0],z[1]}
+ fmul v7.4s, v7.4s, v29.4s
+ ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
+ prfm pldl1keep, [x2, #16]
+ prfm pldl1keep, [x3, #16]
+ fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
+ prfm pldl1keep, [x0, #16]
+ prfm pldl1keep, [x1, #16]
+
+ zip1 v20.4s, v24.4s, v25.4s
+ zip2 v21.4s, v24.4s, v25.4s
+ fneg v22.4s, v20.4s
+ fadd v4.4s, v21.4s, v20.4s
+ fsub v6.4s, v20.4s, v21.4s // just the second half
+ fadd v5.4s, v21.4s, v22.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fadd v20.4s, v16.4s, v4.4s
+ fsub v22.4s, v16.4s, v4.4s
+ fadd v21.4s, v17.4s, v5.4s
+ st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
+ fsub v23.4s, v17.4s, v5.4s
+
+ st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
+ st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
+ st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
+1:
+ ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
+ ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
+ ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
+ transpose v26.2d, v27.2d, v20.2d, v22.2d
+ ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
+ rev64 v6.4s, v26.4s
+ fmul v26.4s, v26.4s, v4.s[0]
+ rev64 v7.4s, v27.4s
+ fmul v27.4s, v27.4s, v4.s[1]
+ fmul v6.4s, v6.4s, v29.4s
+ fmul v7.4s, v7.4s, v29.4s
+ ld1 {v16.4s},[x0] // {z[0],z[1]}
+ fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
+ fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
+ ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
+
+ subs x6, x6, #1 // n--
+
+ zip1 v20.4s, v26.4s, v27.4s
+ zip2 v21.4s, v26.4s, v27.4s
+ fneg v22.4s, v20.4s
+ fadd v4.4s, v21.4s, v20.4s
+ fsub v6.4s, v20.4s, v21.4s // just the second half
+ fadd v5.4s, v21.4s, v22.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fadd v20.4s, v16.4s, v4.4s
+ fsub v22.4s, v16.4s, v4.4s
+ fadd v21.4s, v17.4s, v5.4s
+ st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
+ fsub v23.4s, v17.4s, v5.4s
+
+ st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
+ st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
+ st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro def_fft n, n2, n4
+function fft\n\()_neon, align=6
+ sub sp, sp, #16
+ stp x28, x30, [sp]
+ add x28, x0, #\n4*2*8
+ bl fft\n2\()_neon
+ mov x0, x28
+ bl fft\n4\()_neon
+ add x0, x28, #\n4*1*8
+ bl fft\n4\()_neon
+ sub x0, x28, #\n4*2*8
+ ldp x28, x30, [sp], #16
+ movrel x4, X(ff_cos_\n)
+ mov x2, #\n4>>1
+ b fft_pass_neon
+endfunc
+.endm
+
+ def_fft 32, 16, 8
+ def_fft 64, 32, 16
+ def_fft 128, 64, 32
+ def_fft 256, 128, 64
+ def_fft 512, 256, 128
+ def_fft 1024, 512, 256
+ def_fft 2048, 1024, 512
+ def_fft 4096, 2048, 1024
+ def_fft 8192, 4096, 2048
+ def_fft 16384, 8192, 4096
+ def_fft 32768, 16384, 8192
+ def_fft 65536, 32768, 16384
+
+function ff_fft_calc_neon, export=1
+ prfm pldl1keep, [x1]
+ movrel x10, trans4_float
+ ldr w2, [x0]
+ movrel x11, trans8_float
+ sub w2, w2, #2
+ movrel x3, fft_tab_neon
+ ld1 {v30.16b}, [x10]
+ mov x7, #-8
+ movrel x12, pmmp
+ ldr x3, [x3, x2, lsl #3]
+ movrel x13, mppm
+ movrel x14, X(ff_cos_16)
+ ld1 {v31.16b}, [x11]
+ mov x0, x1
+ ld1 {v29.4s}, [x12] // pmmp
+ ld1 {v28.4s}, [x13]
+ br x3
+endfunc
+
+function ff_fft_permute_neon, export=1
+ mov x6, #1
+ ldr w2, [x0] // nbits
+ ldr x3, [x0, #16] // tmp_buf
+ ldr x0, [x0, #8] // revtab
+ lsl x6, x6, x2
+ mov x2, x6
+1:
+ ld1 {v0.2s,v1.2s}, [x1], #16
+ ldr w4, [x0], #4
+ uxth w5, w4
+ lsr w4, w4, #16
+ add x5, x3, x5, lsl #3
+ add x4, x3, x4, lsl #3
+ st1 {v0.2s}, [x5]
+ st1 {v1.2s}, [x4]
+ subs x6, x6, #2
+ b.gt 1b
+
+ sub x1, x1, x2, lsl #3
+1:
+ ld1 {v0.4s,v1.4s}, [x3], #32
+ st1 {v0.4s,v1.4s}, [x1], #32
+ subs x2, x2, #4
+ b.gt 1b
+
+ ret
+endfunc
+
+const fft_tab_neon, relocate=1
+ .quad fft4_neon
+ .quad fft8_neon
+ .quad fft16_neon
+ .quad fft32_neon
+ .quad fft64_neon
+ .quad fft128_neon
+ .quad fft256_neon
+ .quad fft512_neon
+ .quad fft1024_neon
+ .quad fft2048_neon
+ .quad fft4096_neon
+ .quad fft8192_neon
+ .quad fft16384_neon
+ .quad fft32768_neon
+ .quad fft65536_neon
+endconst
+
+const pmmp, align=4
+ .float +1.0, -1.0, -1.0, +1.0
+endconst
+
+const mppm, align=4
+ .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+endconst
diff --git a/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
new file mode 100644
index 0000000000..fa6e0eaf15
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -0,0 +1,59 @@
+/*
+ * ARM NEON optimised H.264 chroma functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+#include "config.h"
+
+void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
+{
+ const int high_bit_depth = bit_depth > 8;
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags) && !high_bit_depth) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+ c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
+
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+ c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
+ }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
new file mode 100644
index 0000000000..8be7578001
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
+.macro h264_chroma_mc8 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
+ .ifc \type,avg
+ mov x8, x0
+ .endif
+ prfm pldl1strm, [x1]
+ prfm pldl1strm, [x1, x2]
+ .ifc \codec,rv40
+ movrel x6, rv40bias
+ lsr w9, w5, #1
+ lsr w10, w4, #1
+ lsl w9, w9, #3
+ lsl w10, w10, #1
+ add w9, w9, w10
+ add x6, x6, w9, UXTW
+ ld1r {v22.8H}, [x6]
+ .endif
+ .ifc \codec,vc1
+ movi v22.8H, #28
+ .endif
+ mul w7, w4, w5
+ lsl w14, w5, #3
+ lsl w13, w4, #3
+ cmp w7, #0
+ sub w6, w14, w7
+ sub w12, w13, w7
+ sub w4, w7, w13
+ sub w4, w4, w14
+ add w4, w4, #64
+ b.eq 2f
+
+ dup v0.8B, w4
+ dup v1.8B, w12
+ ld1 {v4.8B, v5.8B}, [x1], x2
+ dup v2.8B, w6
+ dup v3.8B, w7
+ ext v5.8B, v4.8B, v5.8B, #1
+1: ld1 {v6.8B, v7.8B}, [x1], x2
+ umull v16.8H, v4.8B, v0.8B
+ umlal v16.8H, v5.8B, v1.8B
+ ext v7.8B, v6.8B, v7.8B, #1
+ ld1 {v4.8B, v5.8B}, [x1], x2
+ umlal v16.8H, v6.8B, v2.8B
+ prfm pldl1strm, [x1]
+ ext v5.8B, v4.8B, v5.8B, #1
+ umlal v16.8H, v7.8B, v3.8B
+ umull v17.8H, v6.8B, v0.8B
+ subs w3, w3, #2
+ umlal v17.8H, v7.8B, v1.8B
+ umlal v17.8H, v4.8B, v2.8B
+ umlal v17.8H, v5.8B, v3.8B
+ prfm pldl1strm, [x1, x2]
+ .ifc \codec,h264
+ rshrn v16.8B, v16.8H, #6
+ rshrn v17.8B, v17.8H, #6
+ .else
+ add v16.8H, v16.8H, v22.8H
+ add v17.8H, v17.8H, v22.8H
+ shrn v16.8B, v16.8H, #6
+ shrn v17.8B, v17.8H, #6
+ .endif
+ .ifc \type,avg
+ ld1 {v20.8B}, [x8], x2
+ ld1 {v21.8B}, [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ urhadd v17.8B, v17.8B, v21.8B
+ .endif
+ st1 {v16.8B}, [x0], x2
+ st1 {v17.8B}, [x0], x2
+ b.gt 1b
+ ret
+
+2: adds w12, w12, w6
+ dup v0.8B, w4
+ b.eq 5f
+ tst w6, w6
+ dup v1.8B, w12
+ b.eq 4f
+
+ ld1 {v4.8B}, [x1], x2
+3: ld1 {v6.8B}, [x1], x2
+ umull v16.8H, v4.8B, v0.8B
+ umlal v16.8H, v6.8B, v1.8B
+ ld1 {v4.8B}, [x1], x2
+ umull v17.8H, v6.8B, v0.8B
+ umlal v17.8H, v4.8B, v1.8B
+ prfm pldl1strm, [x1]
+ .ifc \codec,h264
+ rshrn v16.8B, v16.8H, #6
+ rshrn v17.8B, v17.8H, #6
+ .else
+ add v16.8H, v16.8H, v22.8H
+ add v17.8H, v17.8H, v22.8H
+ shrn v16.8B, v16.8H, #6
+ shrn v17.8B, v17.8H, #6
+ .endif
+ prfm pldl1strm, [x1, x2]
+ .ifc \type,avg
+ ld1 {v20.8B}, [x8], x2
+ ld1 {v21.8B}, [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ urhadd v17.8B, v17.8B, v21.8B
+ .endif
+ subs w3, w3, #2
+ st1 {v16.8B}, [x0], x2
+ st1 {v17.8B}, [x0], x2
+ b.gt 3b
+ ret
+
+4: ld1 {v4.8B, v5.8B}, [x1], x2
+ ld1 {v6.8B, v7.8B}, [x1], x2
+ ext v5.8B, v4.8B, v5.8B, #1
+ ext v7.8B, v6.8B, v7.8B, #1
+ prfm pldl1strm, [x1]
+ subs w3, w3, #2
+ umull v16.8H, v4.8B, v0.8B
+ umlal v16.8H, v5.8B, v1.8B
+ umull v17.8H, v6.8B, v0.8B
+ umlal v17.8H, v7.8B, v1.8B
+ prfm pldl1strm, [x1, x2]
+ .ifc \codec,h264
+ rshrn v16.8B, v16.8H, #6
+ rshrn v17.8B, v17.8H, #6
+ .else
+ add v16.8H, v16.8H, v22.8H
+ add v17.8H, v17.8H, v22.8H
+ shrn v16.8B, v16.8H, #6
+ shrn v17.8B, v17.8H, #6
+ .endif
+ .ifc \type,avg
+ ld1 {v20.8B}, [x8], x2
+ ld1 {v21.8B}, [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ urhadd v17.8B, v17.8B, v21.8B
+ .endif
+ st1 {v16.8B}, [x0], x2
+ st1 {v17.8B}, [x0], x2
+ b.gt 4b
+ ret
+
+5: ld1 {v4.8B}, [x1], x2
+ ld1 {v5.8B}, [x1], x2
+ prfm pldl1strm, [x1]
+ subs w3, w3, #2
+ umull v16.8H, v4.8B, v0.8B
+ umull v17.8H, v5.8B, v0.8B
+ prfm pldl1strm, [x1, x2]
+ .ifc \codec,h264
+ rshrn v16.8B, v16.8H, #6
+ rshrn v17.8B, v17.8H, #6
+ .else
+ add v16.8H, v16.8H, v22.8H
+ add v17.8H, v17.8H, v22.8H
+ shrn v16.8B, v16.8H, #6
+ shrn v17.8B, v17.8H, #6
+ .endif
+ .ifc \type,avg
+ ld1 {v20.8B}, [x8], x2
+ ld1 {v21.8B}, [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ urhadd v17.8B, v17.8B, v21.8B
+ .endif
+ st1 {v16.8B}, [x0], x2
+ st1 {v17.8B}, [x0], x2
+ b.gt 5b
+ ret
+endfunc
+.endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
+.macro h264_chroma_mc4 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
+ .ifc \type,avg
+ mov x8, x0
+ .endif
+ prfm pldl1strm, [x1]
+ prfm pldl1strm, [x1, x2]
+ .ifc \codec,rv40
+ movrel x6, rv40bias
+ lsr w9, w5, #1
+ lsr w10, w4, #1
+ lsl w9, w9, #3
+ lsl w10, w10, #1
+ add w9, w9, w10
+ add x6, x6, w9, UXTW
+ ld1r {v22.8H}, [x6]
+ .endif
+ .ifc \codec,vc1
+ movi v22.8H, #28
+ .endif
+ mul w7, w4, w5
+ lsl w14, w5, #3
+ lsl w13, w4, #3
+ cmp w7, #0
+ sub w6, w14, w7
+ sub w12, w13, w7
+ sub w4, w7, w13
+ sub w4, w4, w14
+ add w4, w4, #64
+ b.eq 2f
+
+ dup v24.8B, w4
+ dup v25.8B, w12
+ ld1 {v4.8B}, [x1], x2
+ dup v26.8B, w6
+ dup v27.8B, w7
+ ext v5.8B, v4.8B, v5.8B, #1
+ trn1 v0.2S, v24.2S, v25.2S
+ trn1 v2.2S, v26.2S, v27.2S
+ trn1 v4.2S, v4.2S, v5.2S
+1: ld1 {v6.8B}, [x1], x2
+ ext v7.8B, v6.8B, v7.8B, #1
+ trn1 v6.2S, v6.2S, v7.2S
+ umull v18.8H, v4.8B, v0.8B
+ umlal v18.8H, v6.8B, v2.8B
+ ld1 {v4.8B}, [x1], x2
+ ext v5.8B, v4.8B, v5.8B, #1
+ trn1 v4.2S, v4.2S, v5.2S
+ prfm pldl1strm, [x1]
+ umull v19.8H, v6.8B, v0.8B
+ umlal v19.8H, v4.8B, v2.8B
+ trn1 v30.2D, v18.2D, v19.2D
+ trn2 v31.2D, v18.2D, v19.2D
+ add v18.8H, v30.8H, v31.8H
+ .ifc \codec,h264
+ rshrn v16.8B, v18.8H, #6
+ .else
+ add v18.8H, v18.8H, v22.8H
+ shrn v16.8B, v18.8H, #6
+ .endif
+ subs w3, w3, #2
+ prfm pldl1strm, [x1, x2]
+ .ifc \type,avg
+ ld1 {v20.S}[0], [x8], x2
+ ld1 {v20.S}[1], [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ .endif
+ st1 {v16.S}[0], [x0], x2
+ st1 {v16.S}[1], [x0], x2
+ b.gt 1b
+ ret
+
+2: adds w12, w12, w6
+ dup v30.8B, w4
+ b.eq 5f
+ tst w6, w6
+ dup v31.8B, w12
+ trn1 v0.2S, v30.2S, v31.2S
+ trn2 v1.2S, v30.2S, v31.2S
+ b.eq 4f
+
+ ext v1.8B, v0.8B, v1.8B, #4
+ ld1 {v4.S}[0], [x1], x2
+3: ld1 {v4.S}[1], [x1], x2
+ umull v18.8H, v4.8B, v0.8B
+ ld1 {v4.S}[0], [x1], x2
+ umull v19.8H, v4.8B, v1.8B
+ trn1 v30.2D, v18.2D, v19.2D
+ trn2 v31.2D, v18.2D, v19.2D
+ add v18.8H, v30.8H, v31.8H
+ prfm pldl1strm, [x1]
+ .ifc \codec,h264
+ rshrn v16.8B, v18.8H, #6
+ .else
+ add v18.8H, v18.8H, v22.8H
+ shrn v16.8B, v18.8H, #6
+ .endif
+ .ifc \type,avg
+ ld1 {v20.S}[0], [x8], x2
+ ld1 {v20.S}[1], [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ .endif
+ subs w3, w3, #2
+ prfm pldl1strm, [x1, x2]
+ st1 {v16.S}[0], [x0], x2
+ st1 {v16.S}[1], [x0], x2
+ b.gt 3b
+ ret
+
+4: ld1 {v4.8B}, [x1], x2
+ ld1 {v6.8B}, [x1], x2
+ ext v5.8B, v4.8B, v5.8B, #1
+ ext v7.8B, v6.8B, v7.8B, #1
+ trn1 v4.2S, v4.2S, v5.2S
+ trn1 v6.2S, v6.2S, v7.2S
+ umull v18.8H, v4.8B, v0.8B
+ umull v19.8H, v6.8B, v0.8B
+ subs w3, w3, #2
+ trn1 v30.2D, v18.2D, v19.2D
+ trn2 v31.2D, v18.2D, v19.2D
+ add v18.8H, v30.8H, v31.8H
+ prfm pldl1strm, [x1]
+ .ifc \codec,h264
+ rshrn v16.8B, v18.8H, #6
+ .else
+ add v18.8H, v18.8H, v22.8H
+ shrn v16.8B, v18.8H, #6
+ .endif
+ .ifc \type,avg
+ ld1 {v20.S}[0], [x8], x2
+ ld1 {v20.S}[1], [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ .endif
+ prfm pldl1strm, [x1]
+ st1 {v16.S}[0], [x0], x2
+ st1 {v16.S}[1], [x0], x2
+ b.gt 4b
+ ret
+
+5: ld1 {v4.S}[0], [x1], x2
+ ld1 {v4.S}[1], [x1], x2
+ umull v18.8H, v4.8B, v30.8B
+ subs w3, w3, #2
+ prfm pldl1strm, [x1]
+ .ifc \codec,h264
+ rshrn v16.8B, v18.8H, #6
+ .else
+ add v18.8H, v18.8H, v22.8H
+ shrn v16.8B, v18.8H, #6
+ .endif
+ .ifc \type,avg
+ ld1 {v20.S}[0], [x8], x2
+ ld1 {v20.S}[1], [x8], x2
+ urhadd v16.8B, v16.8B, v20.8B
+ .endif
+ prfm pldl1strm, [x1]
+ st1 {v16.S}[0], [x0], x2
+ st1 {v16.S}[1], [x0], x2
+ b.gt 5b
+ ret
+endfunc
+.endm
+
+.macro h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+ prfm pldl1strm, [x1]
+ prfm pldl1strm, [x1, x2]
+ orr w7, w4, w5
+ cbz w7, 2f
+
+ mul w7, w4, w5
+ lsl w14, w5, #3
+ lsl w13, w4, #3
+ sub w6, w14, w7
+ sub w12, w13, w7
+ sub w4, w7, w13
+ sub w4, w4, w14
+ add w4, w4, #64
+ dup v0.8B, w4
+ dup v2.8B, w12
+ dup v1.8B, w6
+ dup v3.8B, w7
+ trn1 v0.4H, v0.4H, v2.4H
+ trn1 v1.4H, v1.4H, v3.4H
+1:
+ ld1 {v4.S}[0], [x1], x2
+ ld1 {v4.S}[1], [x1], x2
+ rev64 v5.2S, v4.2S
+ ld1 {v5.S}[1], [x1]
+ ext v6.8B, v4.8B, v5.8B, #1
+ ext v7.8B, v5.8B, v4.8B, #1
+ trn1 v4.4H, v4.4H, v6.4H
+ trn1 v5.4H, v5.4H, v7.4H
+ umull v16.8H, v4.8B, v0.8B
+ umlal v16.8H, v5.8B, v1.8B
+ .ifc \type,avg
+ ld1 {v18.H}[0], [x0], x2
+ ld1 {v18.H}[2], [x0]
+ sub x0, x0, x2
+ .endif
+ rev64 v17.4S, v16.4S
+ add v16.8H, v16.8H, v17.8H
+ rshrn v16.8B, v16.8H, #6
+ .ifc \type,avg
+ urhadd v16.8B, v16.8B, v18.8B
+ .endif
+ st1 {v16.H}[0], [x0], x2
+ st1 {v16.H}[2], [x0], x2
+ subs w3, w3, #2
+ b.gt 1b
+ ret
+
+2:
+ ld1 {v16.H}[0], [x1], x2
+ ld1 {v16.H}[1], [x1], x2
+ .ifc \type,avg
+ ld1 {v18.H}[0], [x0], x2
+ ld1 {v18.H}[1], [x0]
+ sub x0, x0, x2
+ urhadd v16.8B, v16.8B, v18.8B
+ .endif
+ st1 {v16.H}[0], [x0], x2
+ st1 {v16.H}[1], [x0], x2
+ subs w3, w3, #2
+ b.gt 2b
+ ret
+endfunc
+.endm
+
+ h264_chroma_mc8 put
+ h264_chroma_mc8 avg
+ h264_chroma_mc4 put
+ h264_chroma_mc4 avg
+ h264_chroma_mc2 put
+ h264_chroma_mc2 avg
+
+#if CONFIG_RV40_DECODER
+const rv40bias
+ .short 0, 16, 32, 16
+ .short 32, 28, 32, 28
+ .short 0, 32, 16, 32
+ .short 32, 28, 32, 28
+endconst
+
+ h264_chroma_mc8 put, rv40
+ h264_chroma_mc8 avg, rv40
+ h264_chroma_mc4 put, rv40
+ h264_chroma_mc4 avg, rv40
+#endif
+
+#if CONFIG_VC1DSP
+ h264_chroma_mc8 put, vc1
+ h264_chroma_mc8 avg, vc1
+ h264_chroma_mc4 put, vc1
+ h264_chroma_mc4 avg, vc1
+#endif
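
The mc8/mc4/mc2 macros above implement H.264's bilinear chroma interpolation: the four weights derived from the sub-pel offsets x and y always sum to 64, and the special-cased branches (x == 0, y == 0, or both) simply drop the terms whose weight is zero. A scalar C reference of the mc8 case, for orientation only (hypothetical helper name, not part of this patch):

#include <stddef.h>
#include <stdint.h>

/* Weights: A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy; A+B+C+D = 64,
 * so adding 32 before the shift by 6 gives a rounded weighted average. */
static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

The rv40 and vc1 variants generated from the same macros differ only in the rounding applied before the final shift: a bias looked up from rv40bias, or the constant 28, instead of the implicit +32 of rshrn.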
diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
new file mode 100644
index 0000000000..e0f378f5ab
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264dsp.h"
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+
+void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+
+void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+
+av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
+ const int chroma_format_idc)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags) && bit_depth == 8) {
+ c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
+ c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
+ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+ if (chroma_format_idc <= 1)
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
+
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
+
+ c->h264_idct_add = ff_h264_idct_add_neon;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+ c->h264_idct_add16 = ff_h264_idct_add16_neon;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+ if (chroma_format_idc <= 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_neon;
+ c->h264_idct8_add = ff_h264_idct8_add_neon;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
+ }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
new file mode 100644
index 0000000000..4ec35f2905
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro h264_loop_filter_start
+ cmp w2, #0
+ ldr w6, [x4]
+ ccmp w3, #0, #0, ne
+ mov v24.S[0], w6
+ and w6, w6, w6, lsl #16
+ b.eq 1f
+ ands w6, w6, w6, lsl #8
+ b.ge 2f
+1:
+ ret
+2:
+.endm
+
+.macro h264_loop_filter_luma
+ dup v22.16B, w2 // alpha
+ uxtl v24.8H, v24.8B
+ uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
+ uxtl v24.4S, v24.4H
+ uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
+ sli v24.8H, v24.8H, #8
+ uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
+ sli v24.4S, v24.4S, #16
+ cmhi v21.16B, v22.16B, v21.16B // < alpha
+ dup v22.16B, w3 // beta
+ cmlt v23.16B, v24.16B, #0
+ cmhi v28.16B, v22.16B, v28.16B // < beta
+ cmhi v30.16B, v22.16B, v30.16B // < beta
+ bic v21.16B, v21.16B, v23.16B
+ uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
+ and v21.16B, v21.16B, v28.16B
+ uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
+ cmhi v17.16B, v22.16B, v17.16B // < beta
+ and v21.16B, v21.16B, v30.16B
+ cmhi v19.16B, v22.16B, v19.16B // < beta
+ and v17.16B, v17.16B, v21.16B
+ and v19.16B, v19.16B, v21.16B
+ and v24.16B, v24.16B, v21.16B
+ urhadd v28.16B, v16.16B, v0.16B
+ sub v21.16B, v24.16B, v17.16B
+ uqadd v23.16B, v18.16B, v24.16B
+ uhadd v20.16B, v20.16B, v28.16B
+ sub v21.16B, v21.16B, v19.16B
+ uhadd v28.16B, v4.16B, v28.16B
+ umin v23.16B, v23.16B, v20.16B
+ uqsub v22.16B, v18.16B, v24.16B
+ uqadd v4.16B, v2.16B, v24.16B
+ umax v23.16B, v23.16B, v22.16B
+ uqsub v22.16B, v2.16B, v24.16B
+ umin v28.16B, v4.16B, v28.16B
+ uxtl v4.8H, v0.8B
+ umax v28.16B, v28.16B, v22.16B
+ uxtl2 v20.8H, v0.16B
+ usubw v4.8H, v4.8H, v16.8B
+ usubw2 v20.8H, v20.8H, v16.16B
+ shl v4.8H, v4.8H, #2
+ shl v20.8H, v20.8H, #2
+ uaddw v4.8H, v4.8H, v18.8B
+ uaddw2 v20.8H, v20.8H, v18.16B
+ usubw v4.8H, v4.8H, v2.8B
+ usubw2 v20.8H, v20.8H, v2.16B
+ rshrn v4.8B, v4.8H, #3
+ rshrn2 v4.16B, v20.8H, #3
+ bsl v17.16B, v23.16B, v18.16B
+ bsl v19.16B, v28.16B, v2.16B
+ neg v23.16B, v21.16B
+ uxtl v28.8H, v16.8B
+ smin v4.16B, v4.16B, v21.16B
+ uxtl2 v21.8H, v16.16B
+ smax v4.16B, v4.16B, v23.16B
+ uxtl v22.8H, v0.8B
+ uxtl2 v24.8H, v0.16B
+ saddw v28.8H, v28.8H, v4.8B
+ saddw2 v21.8H, v21.8H, v4.16B
+ ssubw v22.8H, v22.8H, v4.8B
+ ssubw2 v24.8H, v24.8H, v4.16B
+ sqxtun v16.8B, v28.8H
+ sqxtun2 v16.16B, v21.8H
+ sqxtun v0.8B, v22.8H
+ sqxtun2 v0.16B, v24.8H
+.endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+ sxtw x1, w1
+
+ ld1 {v0.16B}, [x0], x1
+ ld1 {v2.16B}, [x0], x1
+ ld1 {v4.16B}, [x0], x1
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1
+ ld1 {v20.16B}, [x0], x1
+ ld1 {v18.16B}, [x0], x1
+ ld1 {v16.16B}, [x0], x1
+
+ h264_loop_filter_luma
+
+ sub x0, x0, x1, lsl #1
+ st1 {v17.16B}, [x0], x1
+ st1 {v16.16B}, [x0], x1
+ st1 {v0.16B}, [x0], x1
+ st1 {v19.16B}, [x0]
+
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, #4
+ ld1 {v6.8B}, [x0], x1
+ ld1 {v20.8B}, [x0], x1
+ ld1 {v18.8B}, [x0], x1
+ ld1 {v16.8B}, [x0], x1
+ ld1 {v0.8B}, [x0], x1
+ ld1 {v2.8B}, [x0], x1
+ ld1 {v4.8B}, [x0], x1
+ ld1 {v26.8B}, [x0], x1
+ ld1 {v6.D}[1], [x0], x1
+ ld1 {v20.D}[1], [x0], x1
+ ld1 {v18.D}[1], [x0], x1
+ ld1 {v16.D}[1], [x0], x1
+ ld1 {v0.D}[1], [x0], x1
+ ld1 {v2.D}[1], [x0], x1
+ ld1 {v4.D}[1], [x0], x1
+ ld1 {v26.D}[1], [x0], x1
+
+ transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+ h264_loop_filter_luma
+
+ transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
+
+ sub x0, x0, x1, lsl #4
+ add x0, x0, #2
+ st1 {v17.S}[0], [x0], x1
+ st1 {v16.S}[0], [x0], x1
+ st1 {v0.S}[0], [x0], x1
+ st1 {v19.S}[0], [x0], x1
+ st1 {v17.S}[1], [x0], x1
+ st1 {v16.S}[1], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ st1 {v19.S}[1], [x0], x1
+ st1 {v17.S}[2], [x0], x1
+ st1 {v16.S}[2], [x0], x1
+ st1 {v0.S}[2], [x0], x1
+ st1 {v19.S}[2], [x0], x1
+ st1 {v17.S}[3], [x0], x1
+ st1 {v16.S}[3], [x0], x1
+ st1 {v0.S}[3], [x0], x1
+ st1 {v19.S}[3], [x0], x1
+
+ ret
+endfunc
+
+.macro h264_loop_filter_chroma
+ dup v22.8B, w2 // alpha
+ uxtl v24.8H, v24.8B
+ uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
+ uxtl v4.8H, v0.8B
+ uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
+ usubw v4.8H, v4.8H, v16.8B
+ sli v24.8H, v24.8H, #8
+ shl v4.8H, v4.8H, #2
+ uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
+ uaddw v4.8H, v4.8H, v18.8B
+ cmhi v26.8B, v22.8B, v26.8B // < alpha
+ usubw v4.8H, v4.8H, v2.8B
+ dup v22.8B, w3 // beta
+ rshrn v4.8B, v4.8H, #3
+ cmhi v28.8B, v22.8B, v28.8B // < beta
+ cmhi v30.8B, v22.8B, v30.8B // < beta
+ smin v4.8B, v4.8B, v24.8B
+ neg v25.8B, v24.8B
+ and v26.8B, v26.8B, v28.8B
+ smax v4.8B, v4.8B, v25.8B
+ and v26.8B, v26.8B, v30.8B
+ uxtl v22.8H, v0.8B
+ and v4.8B, v4.8B, v26.8B
+ uxtl v28.8H, v16.8B
+ saddw v28.8H, v28.8H, v4.8B
+ ssubw v22.8H, v22.8H, v4.8B
+ sqxtun v16.8B, v28.8H
+ sqxtun v0.8B, v22.8H
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, x1, lsl #1
+ ld1 {v18.8B}, [x0], x1
+ ld1 {v16.8B}, [x0], x1
+ ld1 {v0.8B}, [x0], x1
+ ld1 {v2.8B}, [x0]
+
+ h264_loop_filter_chroma
+
+ sub x0, x0, x1, lsl #1
+ st1 {v16.8B}, [x0], x1
+ st1 {v0.8B}, [x0], x1
+
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, #2
+ ld1 {v18.S}[0], [x0], x1
+ ld1 {v16.S}[0], [x0], x1
+ ld1 {v0.S}[0], [x0], x1
+ ld1 {v2.S}[0], [x0], x1
+ ld1 {v18.S}[1], [x0], x1
+ ld1 {v16.S}[1], [x0], x1
+ ld1 {v0.S}[1], [x0], x1
+ ld1 {v2.S}[1], [x0], x1
+
+ transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
+
+ h264_loop_filter_chroma
+
+ transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
+
+ sub x0, x0, x1, lsl #3
+ st1 {v18.S}[0], [x0], x1
+ st1 {v16.S}[0], [x0], x1
+ st1 {v0.S}[0], [x0], x1
+ st1 {v2.S}[0], [x0], x1
+ st1 {v18.S}[1], [x0], x1
+ st1 {v16.S}[1], [x0], x1
+ st1 {v0.S}[1], [x0], x1
+ st1 {v2.S}[1], [x0], x1
+
+ ret
+endfunc
+
+.macro biweight_16 macs, macd
+ dup v0.16B, w5
+ dup v1.16B, w6
+ mov v4.16B, v16.16B
+ mov v6.16B, v16.16B
+1: subs w3, w3, #2
+ ld1 {v20.16B}, [x0], x2
+ \macd v4.8H, v0.8B, v20.8B
+ \macd\()2 v6.8H, v0.16B, v20.16B
+ ld1 {v22.16B}, [x1], x2
+ \macs v4.8H, v1.8B, v22.8B
+ \macs\()2 v6.8H, v1.16B, v22.16B
+ mov v24.16B, v16.16B
+ ld1 {v28.16B}, [x0], x2
+ mov v26.16B, v16.16B
+ \macd v24.8H, v0.8B, v28.8B
+ \macd\()2 v26.8H, v0.16B, v28.16B
+ ld1 {v30.16B}, [x1], x2
+ \macs v24.8H, v1.8B, v30.8B
+ \macs\()2 v26.8H, v1.16B, v30.16B
+ sshl v4.8H, v4.8H, v18.8H
+ sshl v6.8H, v6.8H, v18.8H
+ sqxtun v4.8B, v4.8H
+ sqxtun2 v4.16B, v6.8H
+ sshl v24.8H, v24.8H, v18.8H
+ sshl v26.8H, v26.8H, v18.8H
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v26.8H
+ mov v6.16B, v16.16B
+ st1 {v4.16B}, [x7], x2
+ mov v4.16B, v16.16B
+ st1 {v24.16B}, [x7], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro biweight_8 macs, macd
+ dup v0.8B, w5
+ dup v1.8B, w6
+ mov v2.16B, v16.16B
+ mov v20.16B, v16.16B
+1: subs w3, w3, #2
+ ld1 {v4.8B}, [x0], x2
+ \macd v2.8H, v0.8B, v4.8B
+ ld1 {v5.8B}, [x1], x2
+ \macs v2.8H, v1.8B, v5.8B
+ ld1 {v6.8B}, [x0], x2
+ \macd v20.8H, v0.8B, v6.8B
+ ld1 {v7.8B}, [x1], x2
+ \macs v20.8H, v1.8B, v7.8B
+ sshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ sshl v20.8H, v20.8H, v18.8H
+ sqxtun v4.8B, v20.8H
+ mov v20.16B, v16.16B
+ st1 {v2.8B}, [x7], x2
+ mov v2.16B, v16.16B
+ st1 {v4.8B}, [x7], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro biweight_4 macs, macd
+ dup v0.8B, w5
+ dup v1.8B, w6
+ mov v2.16B, v16.16B
+ mov v20.16B,v16.16B
+1: subs w3, w3, #4
+ ld1 {v4.S}[0], [x0], x2
+ ld1 {v4.S}[1], [x0], x2
+ \macd v2.8H, v0.8B, v4.8B
+ ld1 {v5.S}[0], [x1], x2
+ ld1 {v5.S}[1], [x1], x2
+ \macs v2.8H, v1.8B, v5.8B
+ b.lt 2f
+ ld1 {v6.S}[0], [x0], x2
+ ld1 {v6.S}[1], [x0], x2
+ \macd v20.8H, v0.8B, v6.8B
+ ld1 {v7.S}[0], [x1], x2
+ ld1 {v7.S}[1], [x1], x2
+ \macs v20.8H, v1.8B, v7.8B
+ sshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ sshl v20.8H, v20.8H, v18.8H
+ sqxtun v4.8B, v20.8H
+ mov v20.16B, v16.16B
+ st1 {v2.S}[0], [x7], x2
+ st1 {v2.S}[1], [x7], x2
+ mov v2.16B, v16.16B
+ st1 {v4.S}[0], [x7], x2
+ st1 {v4.S}[1], [x7], x2
+ b.ne 1b
+ ret
+2: sshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ st1 {v2.S}[0], [x7], x2
+ st1 {v2.S}[1], [x7], x2
+ ret
+.endm
+
+.macro biweight_func w
+function ff_biweight_h264_pixels_\w\()_neon, export=1
+ sxtw x2, w2
+ lsr w8, w5, #31
+ add w7, w7, #1
+ eor w8, w8, w6, lsr #30
+ orr w7, w7, #1
+ dup v18.8H, w4
+ lsl w7, w7, w4
+ not v18.16B, v18.16B
+ dup v16.8H, w7
+ mov x7, x0
+ cbz w8, 10f
+ subs w8, w8, #1
+ b.eq 20f
+ subs w8, w8, #1
+ b.eq 30f
+ b 40f
+10: biweight_\w umlal, umlal
+20: neg w5, w5
+ biweight_\w umlal, umlsl
+30: neg w5, w5
+ neg w6, w6
+ biweight_\w umlsl, umlsl
+40: neg w6, w6
+ biweight_\w umlsl, umlal
+endfunc
+.endm
+
+ biweight_func 16
+ biweight_func 8
+ biweight_func 4
+
+.macro weight_16 add
+ dup v0.16B, w4
+1: subs w2, w2, #2
+ ld1 {v20.16B}, [x0], x1
+ umull v4.8H, v0.8B, v20.8B
+ umull2 v6.8H, v0.16B, v20.16B
+ ld1 {v28.16B}, [x0], x1
+ umull v24.8H, v0.8B, v28.8B
+ umull2 v26.8H, v0.16B, v28.16B
+ \add v4.8H, v16.8H, v4.8H
+ srshl v4.8H, v4.8H, v18.8H
+ \add v6.8H, v16.8H, v6.8H
+ srshl v6.8H, v6.8H, v18.8H
+ sqxtun v4.8B, v4.8H
+ sqxtun2 v4.16B, v6.8H
+ \add v24.8H, v16.8H, v24.8H
+ srshl v24.8H, v24.8H, v18.8H
+ \add v26.8H, v16.8H, v26.8H
+ srshl v26.8H, v26.8H, v18.8H
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v26.8H
+ st1 {v4.16B}, [x5], x1
+ st1 {v24.16B}, [x5], x1
+ b.ne 1b
+ ret
+.endm
+
+.macro weight_8 add
+ dup v0.8B, w4
+1: subs w2, w2, #2
+ ld1 {v4.8B}, [x0], x1
+ umull v2.8H, v0.8B, v4.8B
+ ld1 {v6.8B}, [x0], x1
+ umull v20.8H, v0.8B, v6.8B
+ \add v2.8H, v16.8H, v2.8H
+ srshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ \add v20.8H, v16.8H, v20.8H
+ srshl v20.8H, v20.8H, v18.8H
+ sqxtun v4.8B, v20.8H
+ st1 {v2.8B}, [x5], x1
+ st1 {v4.8B}, [x5], x1
+ b.ne 1b
+ ret
+.endm
+
+.macro weight_4 add
+ dup v0.8B, w4
+1: subs w2, w2, #4
+ ld1 {v4.S}[0], [x0], x1
+ ld1 {v4.S}[1], [x0], x1
+ umull v2.8H, v0.8B, v4.8B
+ b.lt 2f
+ ld1 {v6.S}[0], [x0], x1
+ ld1 {v6.S}[1], [x0], x1
+ umull v20.8H, v0.8B, v6.8B
+ \add v2.8H, v16.8H, v2.8H
+ srshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ \add v20.8H, v16.8H, v20.8H
+ srshl v20.8H, v20.8h, v18.8H
+ sqxtun v4.8B, v20.8H
+ st1 {v2.S}[0], [x5], x1
+ st1 {v2.S}[1], [x5], x1
+ st1 {v4.S}[0], [x5], x1
+ st1 {v4.S}[1], [x5], x1
+ b.ne 1b
+ ret
+2: \add v2.8H, v16.8H, v2.8H
+ srshl v2.8H, v2.8H, v18.8H
+ sqxtun v2.8B, v2.8H
+ st1 {v2.S}[0], [x5], x1
+ st1 {v2.S}[1], [x5], x1
+ ret
+.endm
+
+.macro weight_func w
+function ff_weight_h264_pixels_\w\()_neon, export=1
+ sxtw x1, w1
+ cmp w3, #1
+ mov w6, #1
+ lsl w5, w5, w3
+ dup v16.8H, w5
+ mov x5, x0
+ b.le 20f
+ sub w6, w6, w3
+ dup v18.8H, w6
+ cmp w4, #0
+ b.lt 10f
+ weight_\w shadd
+10: neg w4, w4
+ weight_\w shsub
+20: neg w6, w3
+ dup v18.8H, w6
+ cmp w4, #0
+ b.lt 10f
+ weight_\w add
+10: neg w4, w4
+ weight_\w sub
+endfunc
+.endm
+
+ weight_func 16
+ weight_func 8
+ weight_func 4
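
The weight/biweight macros above implement H.264 explicit weighted prediction; the shift amount is carried in a vector register (v18) so one macro body covers every log2_denom, and the sign branches (labels 10/20/30/40) select add/subtract accumulate variants rather than multiplying by a negative weight. A scalar C reference of the unidirectional case (hypothetical helper, not part of this patch):

#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* pred = clip(((pix * weight + 2^(log2_den-1)) >> log2_den) + offset),
 * with no rounding term when log2_den == 0. */
static void weight_pixels_ref(uint8_t *block, int stride, int width,
                              int height, int log2_den, int weight, int offset)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = block[x] * weight;
            if (log2_den > 0)
                v = (v + (1 << (log2_den - 1))) >> log2_den;
            block[x] = clip_u8(v + offset);
        }
        block += stride;
    }
}

The bidirectional (biweight) path is the same idea with two weighted inputs summed before the rounding shift.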
diff --git a/media/ffvpx/libavcodec/aarch64/h264idct_neon.S b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
new file mode 100644
index 0000000000..825ec49f8c
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+function ff_h264_idct_add_neon, export=1
+ ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
+ sxtw x2, w2
+ movi v30.8H, #0
+
+ add v4.4H, v0.4H, v2.4H
+ sshr v16.4H, v1.4H, #1
+ st1 {v30.8H}, [x1], #16
+ sshr v17.4H, v3.4H, #1
+ st1 {v30.8H}, [x1], #16
+ sub v5.4H, v0.4H, v2.4H
+ sub v6.4H, v16.4H, v3.4H
+ add v7.4H, v1.4H, v17.4H
+ add v0.4H, v4.4H, v7.4H
+ add v1.4H, v5.4H, v6.4H
+ sub v2.4H, v5.4H, v6.4H
+ sub v3.4H, v4.4H, v7.4H
+
+ transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
+
+ add v4.4H, v0.4H, v2.4H
+ ld1 {v18.S}[0], [x0], x2
+ sshr v16.4H, v3.4H, #1
+ sshr v17.4H, v1.4H, #1
+ ld1 {v18.S}[1], [x0], x2
+ sub v5.4H, v0.4H, v2.4H
+ ld1 {v19.S}[1], [x0], x2
+ add v6.4H, v16.4H, v1.4H
+ ins v4.D[1], v5.D[0]
+ sub v7.4H, v17.4H, v3.4H
+ ld1 {v19.S}[0], [x0], x2
+ ins v6.D[1], v7.D[0]
+ sub x0, x0, x2, lsl #2
+ add v0.8H, v4.8H, v6.8H
+ sub v1.8H, v4.8H, v6.8H
+
+ srshr v0.8H, v0.8H, #6
+ srshr v1.8H, v1.8H, #6
+
+ uaddw v0.8H, v0.8H, v18.8B
+ uaddw v1.8H, v1.8H, v19.8B
+
+ sqxtun v0.8B, v0.8H
+ sqxtun v1.8B, v1.8H
+
+ st1 {v0.S}[0], [x0], x2
+ st1 {v0.S}[1], [x0], x2
+ st1 {v1.S}[1], [x0], x2
+ st1 {v1.S}[0], [x0], x2
+
+ sub x1, x1, #32
+ ret
+endfunc
+
+function ff_h264_idct_dc_add_neon, export=1
+ sxtw x2, w2
+ mov w3, #0
+ ld1r {v2.8H}, [x1]
+ strh w3, [x1]
+ srshr v2.8H, v2.8H, #6
+ ld1 {v0.S}[0], [x0], x2
+ ld1 {v0.S}[1], [x0], x2
+ uaddw v3.8H, v2.8H, v0.8B
+ ld1 {v1.S}[0], [x0], x2
+ ld1 {v1.S}[1], [x0], x2
+ uaddw v4.8H, v2.8H, v1.8B
+ sqxtun v0.8B, v3.8H
+ sqxtun v1.8B, v4.8H
+ sub x0, x0, x2, lsl #2
+ st1 {v0.S}[0], [x0], x2
+ st1 {v0.S}[1], [x0], x2
+ st1 {v1.S}[0], [x0], x2
+ st1 {v1.S}[1], [x0], x2
+ ret
+endfunc
+
+function ff_h264_idct_add16_neon, export=1
+ mov x12, x30
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w9, w3 // stride
+ movrel x7, scan8
+ mov x10, #16
+ movrel x13, X(ff_h264_idct_dc_add_neon)
+ movrel x14, X(ff_h264_idct_add_neon)
+1: mov w2, w9
+ ldrb w3, [x7], #1
+ ldrsw x0, [x5], #4
+ ldrb w3, [x4, w3, uxtw]
+ subs w3, w3, #1
+ b.lt 2f
+ ldrsh w3, [x1]
+ add x0, x0, x6
+ ccmp w3, #0, #4, eq
+ csel x15, x13, x14, ne
+ blr x15
+2: subs x10, x10, #1
+ add x1, x1, #32
+ b.ne 1b
+ ret x12
+endfunc
+
+function ff_h264_idct_add16intra_neon, export=1
+ mov x12, x30
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w9, w3 // stride
+ movrel x7, scan8
+ mov x10, #16
+ movrel x13, X(ff_h264_idct_dc_add_neon)
+ movrel x14, X(ff_h264_idct_add_neon)
+1: mov w2, w9
+ ldrb w3, [x7], #1
+ ldrsw x0, [x5], #4
+ ldrb w3, [x4, w3, uxtw]
+ add x0, x0, x6
+ cmp w3, #0
+ ldrsh w3, [x1]
+ csel x15, x13, x14, eq
+ ccmp w3, #0, #0, eq
+ b.eq 2f
+ blr x15
+2: subs x10, x10, #1
+ add x1, x1, #32
+ b.ne 1b
+ ret x12
+endfunc
+
+function ff_h264_idct_add8_neon, export=1
+ sub sp, sp, #0x40
+ stp x19, x20, [sp]
+ mov x12, x30
+ ldp x6, x15, [x0] // dest[0], dest[1]
+ add x5, x1, #16*4 // block_offset
+ add x9, x2, #16*32 // block
+ mov w19, w3 // stride
+ movrel x13, X(ff_h264_idct_dc_add_neon)
+ movrel x14, X(ff_h264_idct_add_neon)
+ movrel x7, scan8, 16
+ mov x10, #0
+ mov x11, #16
+1: mov w2, w19
+ ldrb w3, [x7, x10] // scan8[i]
+ ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
+ ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
+ add x0, x0, x6 // block_offset[i] + dst[j-1]
+ add x1, x9, x10, lsl #5 // block + i * 16
+ cmp w3, #0
+ ldrsh w3, [x1] // block[i*16]
+ csel x20, x13, x14, eq
+ ccmp w3, #0, #0, eq
+ b.eq 2f
+ blr x20
+2: add x10, x10, #1
+ cmp x10, #4
+ csel x10, x11, x10, eq // mov x10, #16
+ csel x6, x15, x6, eq
+ cmp x10, #20
+ b.lt 1b
+ ldp x19, x20, [sp]
+ add sp, sp, #0x40
+ ret x12
+endfunc
+
+.macro idct8x8_cols pass
+ .if \pass == 0
+ va .req v18
+ vb .req v30
+ sshr v18.8H, v26.8H, #1
+ add v16.8H, v24.8H, v28.8H
+ ld1 {v30.8H, v31.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+ sub v17.8H, v24.8H, v28.8H
+ sshr v19.8H, v30.8H, #1
+ sub v18.8H, v18.8H, v30.8H
+ add v19.8H, v19.8H, v26.8H
+ .else
+ va .req v30
+ vb .req v18
+ sshr v30.8H, v26.8H, #1
+ sshr v19.8H, v18.8H, #1
+ add v16.8H, v24.8H, v28.8H
+ sub v17.8H, v24.8H, v28.8H
+ sub v30.8H, v30.8H, v18.8H
+ add v19.8H, v19.8H, v26.8H
+ .endif
+ add v26.8H, v17.8H, va.8H
+ sub v28.8H, v17.8H, va.8H
+ add v24.8H, v16.8H, v19.8H
+ sub vb.8H, v16.8H, v19.8H
+ sub v16.8H, v29.8H, v27.8H
+ add v17.8H, v31.8H, v25.8H
+ sub va.8H, v31.8H, v25.8H
+ add v19.8H, v29.8H, v27.8H
+ sub v16.8H, v16.8H, v31.8H
+ sub v17.8H, v17.8H, v27.8H
+ add va.8H, va.8H, v29.8H
+ add v19.8H, v19.8H, v25.8H
+ sshr v25.8H, v25.8H, #1
+ sshr v27.8H, v27.8H, #1
+ sshr v29.8H, v29.8H, #1
+ sshr v31.8H, v31.8H, #1
+ sub v16.8H, v16.8H, v31.8H
+ sub v17.8H, v17.8H, v27.8H
+ add va.8H, va.8H, v29.8H
+ add v19.8H, v19.8H, v25.8H
+ sshr v25.8H, v16.8H, #2
+ sshr v27.8H, v17.8H, #2
+ sshr v29.8H, va.8H, #2
+ sshr v31.8H, v19.8H, #2
+ sub v19.8H, v19.8H, v25.8H
+ sub va.8H, v27.8H, va.8H
+ add v17.8H, v17.8H, v29.8H
+ add v16.8H, v16.8H, v31.8H
+ .if \pass == 0
+ sub v31.8H, v24.8H, v19.8H
+ add v24.8H, v24.8H, v19.8H
+ add v25.8H, v26.8H, v18.8H
+ sub v18.8H, v26.8H, v18.8H
+ add v26.8H, v28.8H, v17.8H
+ add v27.8H, v30.8H, v16.8H
+ sub v29.8H, v28.8H, v17.8H
+ sub v28.8H, v30.8H, v16.8H
+ .else
+ sub v31.8H, v24.8H, v19.8H
+ add v24.8H, v24.8H, v19.8H
+ add v25.8H, v26.8H, v30.8H
+ sub v30.8H, v26.8H, v30.8H
+ add v26.8H, v28.8H, v17.8H
+ sub v29.8H, v28.8H, v17.8H
+ add v27.8H, v18.8H, v16.8H
+ sub v28.8H, v18.8H, v16.8H
+ .endif
+ .unreq va
+ .unreq vb
+.endm
+
+function ff_h264_idct8_add_neon, export=1
+ movi v19.8H, #0
+ sxtw x2, w2
+ ld1 {v24.8H, v25.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+ ld1 {v26.8H, v27.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+ ld1 {v28.8H, v29.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+
+ idct8x8_cols 0
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
+ idct8x8_cols 1
+
+ mov x3, x0
+ srshr v24.8H, v24.8H, #6
+ ld1 {v0.8B}, [x0], x2
+ srshr v25.8H, v25.8H, #6
+ ld1 {v1.8B}, [x0], x2
+ srshr v26.8H, v26.8H, #6
+ ld1 {v2.8B}, [x0], x2
+ srshr v27.8H, v27.8H, #6
+ ld1 {v3.8B}, [x0], x2
+ srshr v28.8H, v28.8H, #6
+ ld1 {v4.8B}, [x0], x2
+ srshr v29.8H, v29.8H, #6
+ ld1 {v5.8B}, [x0], x2
+ srshr v30.8H, v30.8H, #6
+ ld1 {v6.8B}, [x0], x2
+ srshr v31.8H, v31.8H, #6
+ ld1 {v7.8B}, [x0], x2
+ uaddw v24.8H, v24.8H, v0.8B
+ uaddw v25.8H, v25.8H, v1.8B
+ uaddw v26.8H, v26.8H, v2.8B
+ sqxtun v0.8B, v24.8H
+ uaddw v27.8H, v27.8H, v3.8B
+ sqxtun v1.8B, v25.8H
+ uaddw v28.8H, v28.8H, v4.8B
+ sqxtun v2.8B, v26.8H
+ st1 {v0.8B}, [x3], x2
+ uaddw v29.8H, v29.8H, v5.8B
+ sqxtun v3.8B, v27.8H
+ st1 {v1.8B}, [x3], x2
+ uaddw v30.8H, v30.8H, v6.8B
+ sqxtun v4.8B, v28.8H
+ st1 {v2.8B}, [x3], x2
+ uaddw v31.8H, v31.8H, v7.8B
+ sqxtun v5.8B, v29.8H
+ st1 {v3.8B}, [x3], x2
+ sqxtun v6.8B, v30.8H
+ sqxtun v7.8B, v31.8H
+ st1 {v4.8B}, [x3], x2
+ st1 {v5.8B}, [x3], x2
+ st1 {v6.8B}, [x3], x2
+ st1 {v7.8B}, [x3], x2
+
+ sub x1, x1, #128
+ ret
+endfunc
+
+function ff_h264_idct8_dc_add_neon, export=1
+ mov w3, #0
+ sxtw x2, w2
+ ld1r {v31.8H}, [x1]
+ strh w3, [x1]
+ ld1 {v0.8B}, [x0], x2
+ srshr v31.8H, v31.8H, #6
+ ld1 {v1.8B}, [x0], x2
+ ld1 {v2.8B}, [x0], x2
+ uaddw v24.8H, v31.8H, v0.8B
+ ld1 {v3.8B}, [x0], x2
+ uaddw v25.8H, v31.8H, v1.8B
+ ld1 {v4.8B}, [x0], x2
+ uaddw v26.8H, v31.8H, v2.8B
+ ld1 {v5.8B}, [x0], x2
+ uaddw v27.8H, v31.8H, v3.8B
+ ld1 {v6.8B}, [x0], x2
+ uaddw v28.8H, v31.8H, v4.8B
+ ld1 {v7.8B}, [x0], x2
+ uaddw v29.8H, v31.8H, v5.8B
+ uaddw v30.8H, v31.8H, v6.8B
+ uaddw v31.8H, v31.8H, v7.8B
+ sqxtun v0.8B, v24.8H
+ sqxtun v1.8B, v25.8H
+ sqxtun v2.8B, v26.8H
+ sqxtun v3.8B, v27.8H
+ sub x0, x0, x2, lsl #3
+ st1 {v0.8B}, [x0], x2
+ sqxtun v4.8B, v28.8H
+ st1 {v1.8B}, [x0], x2
+ sqxtun v5.8B, v29.8H
+ st1 {v2.8B}, [x0], x2
+ sqxtun v6.8B, v30.8H
+ st1 {v3.8B}, [x0], x2
+ sqxtun v7.8B, v31.8H
+ st1 {v4.8B}, [x0], x2
+ st1 {v5.8B}, [x0], x2
+ st1 {v6.8B}, [x0], x2
+ st1 {v7.8B}, [x0], x2
+ ret
+endfunc
+
+function ff_h264_idct8_add4_neon, export=1
+ mov x12, x30
+ mov x6, x0
+ mov x5, x1
+ mov x1, x2
+ mov w2, w3
+ movrel x7, scan8
+ mov w10, #16
+ movrel x13, X(ff_h264_idct8_dc_add_neon)
+ movrel x14, X(ff_h264_idct8_add_neon)
+1: ldrb w9, [x7], #4
+ ldrsw x0, [x5], #16
+ ldrb w9, [x4, w9, UXTW]
+ subs w9, w9, #1
+ b.lt 2f
+ ldrsh w11, [x1]
+ add x0, x6, x0
+ ccmp w11, #0, #4, eq
+ csel x15, x13, x14, ne
+ blr x15
+2: subs w10, w10, #4
+ add x1, x1, #128
+ b.ne 1b
+ ret x12
+endfunc
+
+const scan8
+ .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+ .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+ .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+ .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+ .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+ .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+ .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+ .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+ .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
+ .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
+ .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
+ .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
+endconst
diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_init.c b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
new file mode 100644
index 0000000000..b144376f90
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
+ const int bit_depth,
+ const int chroma_format_idc)
+{
+ const int high_depth = bit_depth > 8;
+
+ if (high_depth)
+ return;
+
+ if (chroma_format_idc <= 1) {
+ h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
+ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon;
+ if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+ h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+ h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
+ if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+ codec_id != AV_CODEC_ID_VP8) {
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon;
+ h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+ }
+ }
+
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon;
+ h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+ if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+ codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
+}
+
+av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
+ int bit_depth, const int chroma_format_idc)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
+}
diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_neon.S b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
new file mode 100644
index 0000000000..213b40b3e7
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro ldcol.8 rd, rs, rt, n=8, hi=0
+.if \n >= 8 || \hi == 0
+ ld1 {\rd\().b}[0], [\rs], \rt
+ ld1 {\rd\().b}[1], [\rs], \rt
+ ld1 {\rd\().b}[2], [\rs], \rt
+ ld1 {\rd\().b}[3], [\rs], \rt
+.endif
+.if \n >= 8 || \hi == 1
+ ld1 {\rd\().b}[4], [\rs], \rt
+ ld1 {\rd\().b}[5], [\rs], \rt
+ ld1 {\rd\().b}[6], [\rs], \rt
+ ld1 {\rd\().b}[7], [\rs], \rt
+.endif
+.if \n == 16
+ ld1 {\rd\().b}[8], [\rs], \rt
+ ld1 {\rd\().b}[9], [\rs], \rt
+ ld1 {\rd\().b}[10], [\rs], \rt
+ ld1 {\rd\().b}[11], [\rs], \rt
+ ld1 {\rd\().b}[12], [\rs], \rt
+ ld1 {\rd\().b}[13], [\rs], \rt
+ ld1 {\rd\().b}[14], [\rs], \rt
+ ld1 {\rd\().b}[15], [\rs], \rt
+.endif
+.endm
+
+function ff_pred16x16_128_dc_neon, export=1
+ movi v0.16b, #128
+ b .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_top_dc_neon, export=1
+ sub x2, x0, x1
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ b .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_left_dc_neon, export=1
+ sub x2, x0, #1
+ ldcol.8 v0, x2, x1, 16
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ b .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_dc_neon, export=1
+ sub x2, x0, x1
+ sub x3, x0, #1
+ ld1 {v0.16b}, [x2]
+ ldcol.8 v1, x3, x1, 16
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+.L_pred16x16_dc_end:
+ mov w3, #8
+6: st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x0], x1
+ subs w3, w3, #1
+ b.ne 6b
+ ret
+endfunc
+
+function ff_pred16x16_hor_neon, export=1
+ sub x2, x0, #1
+ mov w3, #16
+1: ld1r {v0.16b}, [x2], x1
+ st1 {v0.16b}, [x0], x1
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred16x16_vert_neon, export=1
+ sub x2, x0, x1
+ add x1, x1, x1
+ ld1 {v0.16b}, [x2], x1
+ mov w3, #8
+1: st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x2], x1
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred16x16_plane_neon, export=1
+ sub x3, x0, x1
+ movrel x4, p16weight
+ add x2, x3, #8
+ sub x3, x3, #1
+ ld1 {v0.8b}, [x3]
+ ld1 {v2.8b}, [x2], x1
+ ldcol.8 v1, x3, x1
+ add x3, x3, x1
+ ldcol.8 v3, x3, x1
+ rev64 v0.8b, v0.8b
+ rev64 v1.8b, v1.8b
+ uaddl v7.8h, v2.8b, v3.8b
+ usubl v2.8h, v2.8b, v0.8b
+ usubl v3.8h, v3.8b, v1.8b
+ ld1 {v0.8h}, [x4]
+ mul v2.8h, v2.8h, v0.8h
+ mul v3.8h, v3.8h, v0.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v2.8h, v2.8h, v2.8h
+ addp v2.4h, v2.4h, v2.4h
+ sshll v3.4s, v2.4h, #2
+ saddw v2.4s, v3.4s, v2.4h
+ rshrn v4.4h, v2.4s, #6
+ trn2 v5.4h, v4.4h, v4.4h
+ add v2.4h, v4.4h, v5.4h
+ shl v3.4h, v2.4h, #3
+ ext v7.16b, v7.16b, v7.16b, #14
+ sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
+ add v7.4h, v7.4h, v0.4h
+ shl v2.4h, v7.4h, #4
+ sub v2.4h, v2.4h, v3.4h
+ shl v3.4h, v4.4h, #4
+ ext v0.16b, v0.16b, v0.16b, #14
+ sub v6.4h, v5.4h, v3.4h
+ mov v0.h[0], wzr
+ mul v0.8h, v0.8h, v4.h[0]
+ dup v1.8h, v2.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v6.h[0]
+ shl v2.8h, v2.8h, #3
+ add v1.8h, v1.8h, v0.8h
+ add v3.8h, v3.8h, v2.8h
+ mov w3, #16
+1:
+ sqshrun v0.8b, v1.8h, #5
+ add v1.8h, v1.8h, v2.8h
+ sqshrun2 v0.16b, v1.8h, #5
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.16b}, [x0], x1
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+const p16weight, align=4
+ .short 1,2,3,4,5,6,7,8
+endconst
+const p8weight, align=4
+ .short 1,2,3,4,1,2,3,4
+endconst
+
+function ff_pred8x8_hor_neon, export=1
+ sub x2, x0, #1
+ mov w3, #8
+1: ld1r {v0.8b}, [x2], x1
+ st1 {v0.8b}, [x0], x1
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred8x8_vert_neon, export=1
+ sub x2, x0, x1
+ lsl x1, x1, #1
+ ld1 {v0.8b}, [x2], x1
+ mov w3, #4
+1: st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x2], x1
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred8x8_plane_neon, export=1
+ sub x3, x0, x1
+ movrel x4, p8weight
+ movrel x5, p16weight
+ add x2, x3, #4
+ sub x3, x3, #1
+ ld1 {v0.s}[0], [x3]
+ ld1 {v2.s}[0], [x2], x1
+ ldcol.8 v0, x3, x1, 4, hi=1
+ add x3, x3, x1
+ ldcol.8 v3, x3, x1, 4
+ uaddl v7.8h, v2.8b, v3.8b
+ rev32 v0.8b, v0.8b
+ trn1 v2.2s, v2.2s, v3.2s
+ usubl v2.8h, v2.8b, v0.8b
+ ld1 {v6.8h}, [x4]
+ mul v2.8h, v2.8h, v6.8h
+ ld1 {v0.8h}, [x5]
+ saddlp v2.4s, v2.8h
+ addp v2.4s, v2.4s, v2.4s
+ shl v3.4s, v2.4s, #4
+ add v2.4s, v3.4s, v2.4s
+ rshrn v5.4h, v2.4s, #5
+ addp v2.4h, v5.4h, v5.4h
+ shl v3.4h, v2.4h, #1
+ add v3.4h, v3.4h, v2.4h
+ rev64 v7.4h, v7.4h
+ add v7.4h, v7.4h, v0.4h
+ shl v2.4h, v7.4h, #4
+ sub v2.4h, v2.4h, v3.4h
+ ext v0.16b, v0.16b, v0.16b, #14
+ mov v0.h[0], wzr
+ mul v0.8h, v0.8h, v5.h[0]
+ dup v1.8h, v2.h[0]
+ dup v2.8h, v5.h[1]
+ add v1.8h, v1.8h, v0.8h
+ mov w3, #8
+1:
+ sqshrun v0.8b, v1.8h, #5
+ add v1.8h, v1.8h, v2.8h
+ st1 {v0.8b}, [x0], x1
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred8x8_128_dc_neon, export=1
+ movi v0.8b, #128
+ movi v1.8b, #128
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_top_dc_neon, export=1
+ sub x2, x0, x1
+ ld1 {v0.8b}, [x2]
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ zip1 v0.8h, v0.8h, v0.8h
+ rshrn v2.8b, v0.8h, #2
+ zip1 v0.8b, v2.8b, v2.8b
+ zip1 v1.8b, v2.8b, v2.8b
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_left_dc_neon, export=1
+ sub x2, x0, #1
+ ldcol.8 v0, x2, x1
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ rshrn v2.8b, v0.8h, #2
+ dup v1.8b, v2.b[1]
+ dup v0.8b, v2.b[0]
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_dc_neon, export=1
+ sub x2, x0, x1
+ sub x3, x0, #1
+ ld1 {v0.8b}, [x2]
+ ldcol.8 v1, x3, x1
+ uaddlp v0.4h, v0.8b
+ uaddlp v1.4h, v1.8b
+ trn1 v2.2s, v0.2s, v1.2s
+ trn2 v3.2s, v0.2s, v1.2s
+ addp v4.4h, v2.4h, v3.4h
+ addp v5.4h, v4.4h, v4.4h
+ rshrn v6.8b, v5.8h, #3
+ rshrn v7.8b, v4.8h, #2
+ dup v0.8b, v6.b[0]
+ dup v2.8b, v7.b[2]
+ dup v1.8b, v7.b[3]
+ dup v3.8b, v6.b[1]
+ zip1 v0.2s, v0.2s, v2.2s
+ zip1 v1.2s, v1.2s, v3.2s
+.L_pred8x8_dc_end:
+ mov w3, #4
+ add x2, x0, x1, lsl #2
+6: st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x2], x1
+ subs w3, w3, #1
+ b.ne 6b
+ ret
+endfunc
+
+function ff_pred8x8_l0t_dc_neon, export=1
+ sub x2, x0, x1
+ sub x3, x0, #1
+ ld1 {v0.8b}, [x2]
+ ldcol.8 v1, x3, x1, 4
+ zip1 v0.4s, v0.4s, v1.4s
+ uaddlp v0.8h, v0.16b
+ addp v0.8h, v0.8h, v0.8h
+ addp v1.4h, v0.4h, v0.4h
+ rshrn v2.8b, v0.8h, #2
+ rshrn v3.8b, v1.8h, #3
+ dup v4.8b, v3.b[0]
+ dup v6.8b, v2.b[2]
+ dup v5.8b, v2.b[0]
+ zip1 v0.2s, v4.2s, v6.2s
+ zip1 v1.2s, v5.2s, v6.2s
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_l00_dc_neon, export=1
+ sub x2, x0, #1
+ ldcol.8 v0, x2, x1, 4
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ rshrn v0.8b, v0.8h, #2
+ movi v1.8b, #128
+ dup v0.8b, v0.b[0]
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_0lt_dc_neon, export=1
+ add x3, x0, x1, lsl #2
+ sub x2, x0, x1
+ sub x3, x3, #1
+ ld1 {v0.8b}, [x2]
+ ldcol.8 v1, x3, x1, 4, hi=1
+ zip1 v0.4s, v0.4s, v1.4s
+ uaddlp v0.8h, v0.16b
+ addp v0.8h, v0.8h, v0.8h
+ addp v1.4h, v0.4h, v0.4h
+ rshrn v2.8b, v0.8h, #2
+ rshrn v3.8b, v1.8h, #3
+ dup v4.8b, v2.b[0]
+ dup v5.8b, v2.b[3]
+ dup v6.8b, v2.b[2]
+ dup v7.8b, v3.b[1]
+ zip1 v0.2s, v4.2s, v6.2s
+ zip1 v1.2s, v5.2s, v7.2s
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_0l0_dc_neon, export=1
+ add x2, x0, x1, lsl #2
+ sub x2, x2, #1
+ ldcol.8 v1, x2, x1, 4
+ uaddlp v2.4h, v1.8b
+ addp v2.4h, v2.4h, v2.4h
+ rshrn v1.8b, v2.8h, #2
+ movi v0.8b, #128
+ dup v1.8b, v1.b[0]
+ b .L_pred8x8_dc_end
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
new file mode 100644
index 0000000000..144ae2bcc4
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -0,0 +1,123 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hpeldsp.h"
+
+void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
+av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+ c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
+ c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+ c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+ c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+ }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
new file mode 100644
index 0000000000..a491c173bb
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
@@ -0,0 +1,397 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro pixels16 rnd=1, avg=0
+ .if \avg
+ mov x12, x0
+ .endif
+1: ld1 {v0.16B}, [x1], x2
+ ld1 {v1.16B}, [x1], x2
+ ld1 {v2.16B}, [x1], x2
+ ld1 {v3.16B}, [x1], x2
+ .if \avg
+ ld1 {v4.16B}, [x12], x2
+ urhadd v0.16B, v0.16B, v4.16B
+ ld1 {v5.16B}, [x12], x2
+ urhadd v1.16B, v1.16B, v5.16B
+ ld1 {v6.16B}, [x12], x2
+ urhadd v2.16B, v2.16B, v6.16B
+ ld1 {v7.16B}, [x12], x2
+ urhadd v3.16B, v3.16B, v7.16B
+ .endif
+ subs w3, w3, #4
+ st1 {v0.16B}, [x0], x2
+ st1 {v1.16B}, [x0], x2
+ st1 {v2.16B}, [x0], x2
+ st1 {v3.16B}, [x0], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro pixels16_x2 rnd=1, avg=0
+1: ld1 {v0.16B, v1.16B}, [x1], x2
+ ld1 {v2.16B, v3.16B}, [x1], x2
+ subs w3, w3, #2
+ ext v1.16B, v0.16B, v1.16B, #1
+ avg v0.16B, v0.16B, v1.16B
+ ext v3.16B, v2.16B, v3.16B, #1
+ avg v2.16B, v2.16B, v3.16B
+ .if \avg
+ ld1 {v1.16B}, [x0], x2
+ ld1 {v3.16B}, [x0]
+ urhadd v0.16B, v0.16B, v1.16B
+ urhadd v2.16B, v2.16B, v3.16B
+ sub x0, x0, x2
+ .endif
+ st1 {v0.16B}, [x0], x2
+ st1 {v2.16B}, [x0], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro pixels16_y2 rnd=1, avg=0
+ sub w3, w3, #2
+ ld1 {v0.16B}, [x1], x2
+ ld1 {v1.16B}, [x1], x2
+1: subs w3, w3, #2
+ avg v2.16B, v0.16B, v1.16B
+ ld1 {v0.16B}, [x1], x2
+ avg v3.16B, v0.16B, v1.16B
+ ld1 {v1.16B}, [x1], x2
+ .if \avg
+ ld1 {v4.16B}, [x0], x2
+ ld1 {v5.16B}, [x0]
+ urhadd v2.16B, v2.16B, v4.16B
+ urhadd v3.16B, v3.16B, v5.16B
+ sub x0, x0, x2
+ .endif
+ st1 {v2.16B}, [x0], x2
+ st1 {v3.16B}, [x0], x2
+ b.ne 1b
+
+ avg v2.16B, v0.16B, v1.16B
+ ld1 {v0.16B}, [x1], x2
+ avg v3.16B, v0.16B, v1.16B
+ .if \avg
+ ld1 {v4.16B}, [x0], x2
+ ld1 {v5.16B}, [x0]
+ urhadd v2.16B, v2.16B, v4.16B
+ urhadd v3.16B, v3.16B, v5.16B
+ sub x0, x0, x2
+ .endif
+ st1 {v2.16B}, [x0], x2
+ st1 {v3.16B}, [x0], x2
+
+ ret
+.endm
+
+.macro pixels16_xy2 rnd=1, avg=0
+ sub w3, w3, #2
+ ld1 {v0.16B, v1.16B}, [x1], x2
+ ld1 {v4.16B, v5.16B}, [x1], x2
+NRND movi v26.8H, #1
+ ext v1.16B, v0.16B, v1.16B, #1
+ ext v5.16B, v4.16B, v5.16B, #1
+ uaddl v16.8H, v0.8B, v1.8B
+ uaddl2 v20.8H, v0.16B, v1.16B
+ uaddl v18.8H, v4.8B, v5.8B
+ uaddl2 v22.8H, v4.16B, v5.16B
+1: subs w3, w3, #2
+ ld1 {v0.16B, v1.16B}, [x1], x2
+ add v24.8H, v16.8H, v18.8H
+NRND add v24.8H, v24.8H, v26.8H
+ ext v30.16B, v0.16B, v1.16B, #1
+ add v1.8H, v20.8H, v22.8H
+ mshrn v28.8B, v24.8H, #2
+NRND add v1.8H, v1.8H, v26.8H
+ mshrn2 v28.16B, v1.8H, #2
+ .if \avg
+ ld1 {v16.16B}, [x0]
+ urhadd v28.16B, v28.16B, v16.16B
+ .endif
+ uaddl v16.8H, v0.8B, v30.8B
+ ld1 {v2.16B, v3.16B}, [x1], x2
+ uaddl2 v20.8H, v0.16B, v30.16B
+ st1 {v28.16B}, [x0], x2
+ add v24.8H, v16.8H, v18.8H
+NRND add v24.8H, v24.8H, v26.8H
+ ext v3.16B, v2.16B, v3.16B, #1
+ add v0.8H, v20.8H, v22.8H
+ mshrn v30.8B, v24.8H, #2
+NRND add v0.8H, v0.8H, v26.8H
+ mshrn2 v30.16B, v0.8H, #2
+ .if \avg
+ ld1 {v18.16B}, [x0]
+ urhadd v30.16B, v30.16B, v18.16B
+ .endif
+ uaddl v18.8H, v2.8B, v3.8B
+ uaddl2 v22.8H, v2.16B, v3.16B
+ st1 {v30.16B}, [x0], x2
+ b.gt 1b
+
+ ld1 {v0.16B, v1.16B}, [x1], x2
+ add v24.8H, v16.8H, v18.8H
+NRND add v24.8H, v24.8H, v26.8H
+ ext v30.16B, v0.16B, v1.16B, #1
+ add v1.8H, v20.8H, v22.8H
+ mshrn v28.8B, v24.8H, #2
+NRND add v1.8H, v1.8H, v26.8H
+ mshrn2 v28.16B, v1.8H, #2
+ .if \avg
+ ld1 {v16.16B}, [x0]
+ urhadd v28.16B, v28.16B, v16.16B
+ .endif
+ uaddl v16.8H, v0.8B, v30.8B
+ uaddl2 v20.8H, v0.16B, v30.16B
+ st1 {v28.16B}, [x0], x2
+ add v24.8H, v16.8H, v18.8H
+NRND add v24.8H, v24.8H, v26.8H
+ add v0.8H, v20.8H, v22.8H
+ mshrn v30.8B, v24.8H, #2
+NRND add v0.8H, v0.8H, v26.8H
+ mshrn2 v30.16B, v0.8H, #2
+ .if \avg
+ ld1 {v18.16B}, [x0]
+ urhadd v30.16B, v30.16B, v18.16B
+ .endif
+ st1 {v30.16B}, [x0], x2
+
+ ret
+.endm
+
+.macro pixels8 rnd=1, avg=0
+1: ld1 {v0.8B}, [x1], x2
+ ld1 {v1.8B}, [x1], x2
+ ld1 {v2.8B}, [x1], x2
+ ld1 {v3.8B}, [x1], x2
+ .if \avg
+ ld1 {v4.8B}, [x0], x2
+ urhadd v0.8B, v0.8B, v4.8B
+ ld1 {v5.8B}, [x0], x2
+ urhadd v1.8B, v1.8B, v5.8B
+ ld1 {v6.8B}, [x0], x2
+ urhadd v2.8B, v2.8B, v6.8B
+ ld1 {v7.8B}, [x0], x2
+ urhadd v3.8B, v3.8B, v7.8B
+ sub x0, x0, x2, lsl #2
+ .endif
+ subs w3, w3, #4
+ st1 {v0.8B}, [x0], x2
+ st1 {v1.8B}, [x0], x2
+ st1 {v2.8B}, [x0], x2
+ st1 {v3.8B}, [x0], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro pixels8_x2 rnd=1, avg=0
+1: ld1 {v0.8B, v1.8B}, [x1], x2
+ ext v1.8B, v0.8B, v1.8B, #1
+ ld1 {v2.8B, v3.8B}, [x1], x2
+ ext v3.8B, v2.8B, v3.8B, #1
+ subs w3, w3, #2
+ avg v0.8B, v0.8B, v1.8B
+ avg v2.8B, v2.8B, v3.8B
+ .if \avg
+ ld1 {v4.8B}, [x0], x2
+ ld1 {v5.8B}, [x0]
+ urhadd v0.8B, v0.8B, v4.8B
+ urhadd v2.8B, v2.8B, v5.8B
+ sub x0, x0, x2
+ .endif
+ st1 {v0.8B}, [x0], x2
+ st1 {v2.8B}, [x0], x2
+ b.ne 1b
+ ret
+.endm
+
+.macro pixels8_y2 rnd=1, avg=0
+ sub w3, w3, #2
+ ld1 {v0.8B}, [x1], x2
+ ld1 {v1.8B}, [x1], x2
+1: subs w3, w3, #2
+ avg v4.8B, v0.8B, v1.8B
+ ld1 {v0.8B}, [x1], x2
+ avg v5.8B, v0.8B, v1.8B
+ ld1 {v1.8B}, [x1], x2
+ .if \avg
+ ld1 {v2.8B}, [x0], x2
+ ld1 {v3.8B}, [x0]
+ urhadd v4.8B, v4.8B, v2.8B
+ urhadd v5.8B, v5.8B, v3.8B
+ sub x0, x0, x2
+ .endif
+ st1 {v4.8B}, [x0], x2
+ st1 {v5.8B}, [x0], x2
+ b.ne 1b
+
+ avg v4.8B, v0.8B, v1.8B
+ ld1 {v0.8B}, [x1], x2
+ avg v5.8B, v0.8B, v1.8B
+ .if \avg
+ ld1 {v2.8B}, [x0], x2
+ ld1 {v3.8B}, [x0]
+ urhadd v4.8B, v4.8B, v2.8B
+ urhadd v5.8B, v5.8B, v3.8B
+ sub x0, x0, x2
+ .endif
+ st1 {v4.8B}, [x0], x2
+ st1 {v5.8B}, [x0], x2
+
+ ret
+.endm
+
+.macro pixels8_xy2 rnd=1, avg=0
+ sub w3, w3, #2
+ ld1 {v0.16B}, [x1], x2
+ ld1 {v1.16B}, [x1], x2
+NRND movi v19.8H, #1
+ ext v4.16B, v0.16B, v4.16B, #1
+ ext v6.16B, v1.16B, v6.16B, #1
+ uaddl v16.8H, v0.8B, v4.8B
+ uaddl v17.8H, v1.8B, v6.8B
+1: subs w3, w3, #2
+ ld1 {v0.16B}, [x1], x2
+ add v18.8H, v16.8H, v17.8H
+ ext v4.16B, v0.16B, v4.16B, #1
+NRND add v18.8H, v18.8H, v19.8H
+ uaddl v16.8H, v0.8B, v4.8B
+ mshrn v5.8B, v18.8H, #2
+ ld1 {v1.16B}, [x1], x2
+ add v18.8H, v16.8H, v17.8H
+ .if \avg
+ ld1 {v7.8B}, [x0]
+ urhadd v5.8B, v5.8B, v7.8B
+ .endif
+NRND add v18.8H, v18.8H, v19.8H
+ st1 {v5.8B}, [x0], x2
+ mshrn v7.8B, v18.8H, #2
+ .if \avg
+ ld1 {v5.8B}, [x0]
+ urhadd v7.8B, v7.8B, v5.8B
+ .endif
+ ext v6.16B, v1.16B, v6.16B, #1
+ uaddl v17.8H, v1.8B, v6.8B
+ st1 {v7.8B}, [x0], x2
+ b.gt 1b
+
+ ld1 {v0.16B}, [x1], x2
+ add v18.8H, v16.8H, v17.8H
+ ext v4.16B, v0.16B, v4.16B, #1
+NRND add v18.8H, v18.8H, v19.8H
+ uaddl v16.8H, v0.8B, v4.8B
+ mshrn v5.8B, v18.8H, #2
+ add v18.8H, v16.8H, v17.8H
+ .if \avg
+ ld1 {v7.8B}, [x0]
+ urhadd v5.8B, v5.8B, v7.8B
+ .endif
+NRND add v18.8H, v18.8H, v19.8H
+ st1 {v5.8B}, [x0], x2
+ mshrn v7.8B, v18.8H, #2
+ .if \avg
+ ld1 {v5.8B}, [x0]
+ urhadd v7.8B, v7.8B, v5.8B
+ .endif
+ st1 {v7.8B}, [x0], x2
+
+ ret
+.endm
+
+.macro pixfunc pfx, name, suf, rnd=1, avg=0
+ .if \rnd
+ .macro avg rd, rn, rm
+ urhadd \rd, \rn, \rm
+ .endm
+ .macro mshrn rd, rn, rm
+ rshrn \rd, \rn, \rm
+ .endm
+ .macro mshrn2 rd, rn, rm
+ rshrn2 \rd, \rn, \rm
+ .endm
+ .macro NRND insn:vararg
+ .endm
+ .else
+ .macro avg rd, rn, rm
+ uhadd \rd, \rn, \rm
+ .endm
+ .macro mshrn rd, rn, rm
+ shrn \rd, \rn, \rm
+ .endm
+ .macro mshrn2 rd, rn, rm
+ shrn2 \rd, \rn, \rm
+ .endm
+ .macro NRND insn:vararg
+ \insn
+ .endm
+ .endif
+function ff_\pfx\name\suf\()_neon, export=1
+ \name \rnd, \avg
+endfunc
+ .purgem avg
+ .purgem mshrn
+ .purgem mshrn2
+ .purgem NRND
+.endm
+
+.macro pixfunc2 pfx, name, avg=0
+ pixfunc \pfx, \name, rnd=1, avg=\avg
+ pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
+.endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+ mov w3, #16
+endfunc
+
+ pixfunc put_, pixels16, avg=0
+ pixfunc2 put_, pixels16_x2, avg=0
+ pixfunc2 put_, pixels16_y2, avg=0
+ pixfunc2 put_, pixels16_xy2, avg=0
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+ mov w3, #16
+endfunc
+
+ pixfunc avg_, pixels16, avg=1
+ pixfunc2 avg_, pixels16_x2, avg=1
+ pixfunc2 avg_, pixels16_y2, avg=1
+ pixfunc2 avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+ mov w3, #8
+endfunc
+
+ pixfunc put_, pixels8, avg=0
+ pixfunc2 put_, pixels8_x2, avg=0
+ pixfunc2 put_, pixels8_y2, avg=0
+ pixfunc2 put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+ mov w3, #8
+endfunc
+
+ pixfunc avg_, pixels8, avg=1
+ pixfunc avg_, pixels8_x2, avg=1
+ pixfunc avg_, pixels8_y2, avg=1
+ pixfunc avg_, pixels8_xy2, avg=1
diff --git a/media/ffvpx/libavcodec/aarch64/idct.h b/media/ffvpx/libavcodec/aarch64/idct.h
new file mode 100644
index 0000000000..5c49046148
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/idct.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_IDCT_H
+#define AVCODEC_AARCH64_IDCT_H
+
+#include <stdint.h>
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+
+#endif /* AVCODEC_AARCH64_IDCT_H */
diff --git a/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
new file mode 100644
index 0000000000..0406e60830
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -0,0 +1,41 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idct.h"
+
+av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+{
+ if (!avctx->lowres && !high_bit_depth) {
+ if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+ c->idct_put = ff_simple_idct_put_neon;
+ c->idct_add = ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->perm_type = FF_IDCT_PERM_PARTTRANS;
+ }
+ }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/mdct_neon.S b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
new file mode 100644
index 0000000000..1fd199c972
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
@@ -0,0 +1,323 @@
+/*
+ * AArch64 NEON optimised MDCT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_imdct_half_neon, export=1
+ sub sp, sp, #32
+ stp x19, x20, [sp]
+ str x30, [sp, #16]
+ mov x12, #1
+ ldr w14, [x0, #28] // mdct_bits
+ ldr x4, [x0, #32] // tcos
+ ldr x3, [x0, #8] // revtab
+ lsl x12, x12, x14 // n = 1 << nbits
+ lsr x14, x12, #2 // n4 = n >> 2
+ add x7, x2, x12, lsl #1
+ mov x12, #-16
+ sub x7, x7, #16
+
+ ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
+ ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x
+ rev64 v17.2s, v17.2s
+ ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s1
+ fmul v6.2s, v17.2s, v2.2s
+ fmul v7.2s, v0.2s, v2.2s
+1:
+ subs x14, x14, #2
+ ldr w6, [x3], #4
+ fmul v4.2s, v0.2s, v3.2s
+ fmul v5.2s, v17.2s, v3.2s
+ fsub v4.2s, v6.2s, v4.2s
+ fadd v5.2s, v5.2s, v7.2s
+ ubfm x8, x6, #16, #31
+ ubfm x6, x6, #0, #15
+ add x8, x1, x8, lsl #3
+ add x6, x1, x6, lsl #3
+ b.eq 2f
+ ld2 {v16.2s,v17.2s}, [x7], x12
+ ld2 {v0.2s,v1.2s}, [x2], #16
+ rev64 v17.2s, v17.2s
+ ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s1
+ fmul v6.2s, v17.2s, v2.2s
+ fmul v7.2s, v0.2s, v2.2s
+ st2 {v4.s,v5.s}[0], [x6]
+ st2 {v4.s,v5.s}[1], [x8]
+ b 1b
+2:
+ st2 {v4.s,v5.s}[0], [x6]
+ st2 {v4.s,v5.s}[1], [x8]
+
+ mov x19, x0
+ mov x20, x1
+ bl X(ff_fft_calc_neon)
+
+ mov x12, #1
+ ldr w14, [x19, #28] // mdct_bits
+ ldr x4, [x19, #32] // tcos
+ lsl x12, x12, x14 // n = 1 << nbits
+ lsr x14, x12, #3 // n8 = n >> 3
+
+ add x4, x4, x14, lsl #3
+ add x6, x20, x14, lsl #3
+ sub x1, x4, #16
+ sub x3, x6, #16
+
+ mov x7, #-16
+ mov x8, x6
+ mov x0, x3
+
+ ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0
+ ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
+ ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d17=s1,s0
+3:
+ subs x14, x14, #2
+ fmul v7.2s, v0.2s, v17.2s
+ ld2 {v18.2s,v19.2s},[x4], #16 // d18=c2,c3 d19=s2,s3
+ fmul v4.2s, v1.2s, v17.2s
+ fmul v6.2s, v21.2s, v19.2s
+ fmul v5.2s, v20.2s, v19.2s
+ fmul v22.2s, v1.2s, v16.2s
+ fmul v23.2s, v21.2s, v18.2s
+ fmul v24.2s, v0.2s, v16.2s
+ fmul v25.2s, v20.2s, v18.2s
+ fadd v7.2s, v7.2s, v22.2s
+ fadd v5.2s, v5.2s, v23.2s
+ fsub v4.2s, v4.2s, v24.2s
+ fsub v6.2s, v6.2s, v25.2s
+ b.eq 4f
+ ld2 {v0.2s,v1.2s}, [x3], x7
+ ld2 {v20.2s,v21.2s},[x6], #16
+ ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d17=s1,s0
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0], x7
+ st2 {v6.2s,v7.2s}, [x8], #16
+ b 3b
+4:
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0]
+ st2 {v6.2s,v7.2s}, [x8]
+
+ ldp x19, x20, [sp]
+ ldr x30, [sp, #16]
+ add sp, sp, #32
+
+ ret
+endfunc
+
+function ff_imdct_calc_neon, export=1
+ sub sp, sp, #32
+ stp x19, x20, [sp]
+ str x30, [sp, #16]
+ ldr w3, [x0, #28] // mdct_bits
+ mov x19, #1
+ mov x20, x1
+ lsl x19, x19, x3
+ add x1, x1, x19
+
+ bl X(ff_imdct_half_neon)
+
+ add x0, x20, x19, lsl #2
+ add x1, x20, x19, lsl #1
+ sub x0, x0, #8
+ sub x2, x1, #16
+ mov x3, #-16
+ mov x6, #-8
+1:
+ ld1 {v0.4s}, [x2], x3
+ prfum pldl1keep, [x0, #-16]
+ rev64 v0.4s, v0.4s
+ ld1 {v2.2s,v3.2s}, [x1], #16
+ fneg v4.4s, v0.4s
+ prfum pldl1keep, [x2, #-16]
+ rev64 v2.2s, v2.2s
+ rev64 v3.2s, v3.2s
+ ext v4.16b, v4.16b, v4.16b, #8
+ st1 {v2.2s}, [x0], x6
+ st1 {v3.2s}, [x0], x6
+ st1 {v4.4s}, [x20], #16
+ subs x19, x19, #16
+ b.gt 1b
+
+ ldp x19, x20, [sp], #16
+ ldr x30, [sp], #16
+
+ ret
+endfunc
+
+
+function ff_mdct_calc_neon, export=1
+ sub sp, sp, #32
+ stp x19, x20, [sp]
+ str x30, [sp, #16]
+
+ mov x12, #1
+ ldr w14, [x0, #28] // mdct_bits
+ ldr x4, [x0, #32] // tcos
+ ldr x3, [x0, #8] // revtab
+ lsl x14, x12, x14 // n = 1 << nbits
+ add x7, x2, x14 // in4u
+ sub x9, x7, #16 // in4d
+ add x2, x7, x14, lsl #1 // in3u
+ add x8, x9, x14, lsl #1 // in3d
+ add x5, x4, x14, lsl #1
+ sub x5, x5, #16
+ sub x3, x3, #4
+ mov x12, #-16
+ lsr x13, x14, #1
+
+ ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
+ ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
+ ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
+ rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
+ rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
+ ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
+ fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
+ ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
+ rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
+ rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
+ ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
+ fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
+ fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
+ fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
+1:
+ fmul v7.2s, v0.2s, v21.2s // I*s
+ ldr w10, [x3, x13]
+ fmul v6.2s, v2.2s, v20.2s // -R*c
+ ldr w6, [x3, #4]!
+ fmul v4.2s, v2.2s, v21.2s // -R*s
+ fmul v5.2s, v0.2s, v20.2s // I*c
+ fmul v24.2s, v16.2s, v30.2s // R*c
+ fmul v25.2s, v18.2s, v31.2s // -I*s
+ fmul v22.2s, v16.2s, v31.2s // R*s
+ fmul v23.2s, v18.2s, v30.2s // I*c
+ subs x14, x14, #16
+ subs x13, x13, #8
+ fsub v6.2s, v6.2s, v7.2s // -R*c-I*s
+ fadd v7.2s, v4.2s, v5.2s // -R*s+I*c
+ fsub v24.2s, v25.2s, v24.2s // I*s-R*c
+ fadd v25.2s, v22.2s, v23.2s // R*s-I*c
+ b.eq 1f
+ mov x12, #-16
+ ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
+ ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
+ fneg v7.2s, v7.2s // R*s-I*c
+ ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
+ rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
+ rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
+ ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
+ fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
+ ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
+ rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
+ rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
+ ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
+ fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
+ fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
+ fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
+ ubfm x12, x6, #16, #31
+ ubfm x6, x6, #0, #15
+ add x12, x1, x12, lsl #3
+ add x6, x1, x6, lsl #3
+ st2 {v6.s,v7.s}[0], [x6]
+ st2 {v6.s,v7.s}[1], [x12]
+ ubfm x6, x10, #16, #31
+ ubfm x10, x10, #0, #15
+ add x6 , x1, x6, lsl #3
+ add x10, x1, x10, lsl #3
+ st2 {v24.s,v25.s}[0], [x10]
+ st2 {v24.s,v25.s}[1], [x6]
+ b 1b
+1:
+ fneg v7.2s, v7.2s // R*s-I*c
+ ubfm x12, x6, #16, #31
+ ubfm x6, x6, #0, #15
+ add x12, x1, x12, lsl #3
+ add x6, x1, x6, lsl #3
+ st2 {v6.s,v7.s}[0], [x6]
+ st2 {v6.s,v7.s}[1], [x12]
+ ubfm x6, x10, #16, #31
+ ubfm x10, x10, #0, #15
+ add x6 , x1, x6, lsl #3
+ add x10, x1, x10, lsl #3
+ st2 {v24.s,v25.s}[0], [x10]
+ st2 {v24.s,v25.s}[1], [x6]
+
+ mov x19, x0
+ mov x20, x1
+ bl X(ff_fft_calc_neon)
+
+ mov x12, #1
+ ldr w14, [x19, #28] // mdct_bits
+ ldr x4, [x19, #32] // tcos
+ lsl x12, x12, x14 // n = 1 << nbits
+ lsr x14, x12, #3 // n8 = n >> 3
+
+ add x4, x4, x14, lsl #3
+ add x6, x20, x14, lsl #3
+ sub x1, x4, #16
+ sub x3, x6, #16
+
+ mov x7, #-16
+ mov x8, x6
+ mov x0, x3
+
+ ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0
+ ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
+ ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
+1:
+ subs x14, x14, #2
+ fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0
+ ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
+ fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0
+ fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3
+ fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3
+ fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0
+ fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3
+ fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3
+ fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0
+ fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0
+ fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3
+ fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3
+ fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0
+ fneg v4.2s, v4.2s
+ fneg v6.2s, v6.2s
+ b.eq 1f
+ ld2 {v0.2s, v1.2s}, [x3], x7
+ ld2 {v20.2s,v21.2s}, [x6], #16
+ ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0], x7
+ st2 {v6.2s,v7.2s}, [x8], #16
+ b 1b
+1:
+ rev64 v5.2s, v5.2s
+ rev64 v7.2s, v7.2s
+ st2 {v4.2s,v5.2s}, [x0]
+ st2 {v6.2s,v7.2s}, [x8]
+
+ ldp x19, x20, [sp], #16
+ ldr x30, [sp], #16
+ ret
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/moz.build b/media/ffvpx/libavcodec/aarch64/moz.build
new file mode 100644
index 0000000000..2da948afc4
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/moz.build
@@ -0,0 +1,50 @@
+## -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+## vim: set filetype=python:
+## This Source Code Form is subject to the terms of the Mozilla Public
+## License, v. 2.0. If a copy of the MPL was not distributed with this
+## file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+SOURCES += [
+ 'h264chroma_init_aarch64.c',
+ 'h264cmc_neon.S',
+ 'h264dsp_init_aarch64.c',
+ 'h264dsp_neon.S',
+ 'h264idct_neon.S',
+ 'h264pred_init.c',
+ 'h264pred_neon.S',
+ 'hpeldsp_init_aarch64.c',
+ 'hpeldsp_neon.S',
+ 'idctdsp_init_aarch64.c',
+ 'mdct_neon.S',
+ 'mpegaudiodsp_init.c',
+ 'mpegaudiodsp_neon.S',
+ 'neon.S',
+ 'simple_idct_neon.S',
+ 'videodsp.S',
+ 'videodsp_init.c',
+ 'vp8dsp_init_aarch64.c',
+ 'vp8dsp_neon.S',
+ 'vp9dsp_init_10bpp_aarch64.c',
+ 'vp9dsp_init_12bpp_aarch64.c',
+ 'vp9dsp_init_aarch64.c',
+ 'vp9itxfm_16bpp_neon.S',
+ 'vp9itxfm_neon.S',
+ 'vp9lpf_16bpp_neon.S',
+ 'vp9lpf_neon.S',
+ 'vp9mc_16bpp_neon.S',
+ 'vp9mc_neon.S',
+]
+
+if CONFIG['OS_ARCH'] == 'WINNT':
+ USE_INTEGRATED_CLANGCL_AS = True
+ DEFINES['EXTERN_ASM'] = ''
+
+if CONFIG['MOZ_LIBAV_FFT']:
+ SOURCES += [
+ 'fft_init_aarch64.c',
+ 'fft_neon.S',
+ ]
+
+FINAL_LIBRARY = 'mozavcodec'
+
+include('/media/ffvpx/ffvpxcommon.mozbuild')
diff --git a/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c
new file mode 100644
index 0000000000..5d966af5f4
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c
@@ -0,0 +1,40 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegaudiodsp.h"
+#include "config.h"
+
+void ff_mpadsp_apply_window_fixed_neon(int32_t *synth_buf, int32_t *window,
+ int *dither, int16_t *samples, ptrdiff_t incr);
+void ff_mpadsp_apply_window_float_neon(float *synth_buf, float *window,
+ int *dither, float *samples, ptrdiff_t incr);
+
+av_cold void ff_mpadsp_init_aarch64(MPADSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->apply_window_fixed = ff_mpadsp_apply_window_fixed_neon;
+ s->apply_window_float = ff_mpadsp_apply_window_float_neon;
+ }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S
new file mode 100644
index 0000000000..b6ef131228
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define FRAC_BITS 23 // fractional bits for sb_samples and dct
+#define WFRAC_BITS 16 // fractional bits for window
+#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
+
+const tbl_rev128_s, align=4
+ .byte 12, 13, 14, 15
+ .byte 8, 9, 10, 11
+ .byte 4, 5, 6, 7
+ .byte 0, 1, 2, 3
+endconst
+
+.macro apply_window type, st
+function ff_mpadsp_apply_window_\type\()_neon, export=1
+ mov x7, x0
+ add x8, x0, #512<<2
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64
+ st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64
+ movrel x15, tbl_rev128_s
+ ld1 {v27.4s}, [x15]
+.ifc \type, fixed
+ lsl x4, x4, #1
+.else
+ lsl x4, x4, #2
+.endif
+ add x10, x0, #45<<2
+ add x0, x0, #16<<2
+ add x1, x1, #16<<2
+ add x5, x3, x4, lsl #5
+ sub x5, x5, x4 // samples2
+ neg x13, x4 // -incr
+ mov x9, #64<<2
+.ifc \type, fixed
+ ld1r {v16.2s}, [x2] // dither_state
+ sxtl v16.2d, v16.2s
+ movi v29.2d, #0
+ movi v30.2d, #(1<<OUT_SHIFT)-1
+ trn1 v31.2d, v29.2d, v30.2d
+ trn2 v30.2d, v30.2d, v29.2d
+ trn1 v16.2d, v16.2d, v29.2d
+.else
+ movi v16.4s, #0
+ movi v28.4s, #0
+.endif
+ mov x14, #4
+1:
+ mov x8, x0
+ sub x7, x1, #3<<2
+ sub x6, x1, x14, lsl #4
+ add x7, x7, x14, lsl #4
+ add x11, x6, #(32)<<2 // w + 32
+ add x12, x7, #(32)<<2 // w2 + 32
+ mov x15, #8
+ movi v17.2d, #0
+ movi v18.2d, #0
+ movi v19.2d, #0
+2:
+ subs x15, x15, #1
+ ld1 {v0.4s}, [x8], x9
+ ld1 {v1.4s}, [x10], x9
+ ld1 {v2.4s}, [x6], x9
+ ld1 {v3.4s}, [x7], x9
+ tbl v6.16b, {v0.16b}, v27.16b
+ tbl v7.16b, {v1.16b}, v27.16b
+ ld1 {v4.4s}, [x11], x9
+ ld1 {v5.4s}, [x12], x9
+ MLA v16, v2, v0
+ MLA2 v17, v2, v0
+ MLS v18, v3, v6
+ MLS2 v19, v3, v6
+ MLS v16, v4, v7
+ MLS2 v17, v4, v7
+ MLS v18, v5, v1
+ MLS2 v19, v5, v1
+ b.gt 2b
+
+ cmp x14, #4
+ sub x10, x10, #64<<5 // 64 * 8 * sizeof(int32_t)
+
+.ifc \type, fixed
+ and v28.16b, v16.16b, v30.16b
+ ext v28.16b, v29.16b, v28.16b, #8
+
+ b.eq 4f
+ round_sample v19, 1, 1
+4:
+ round_sample v16, 1, 0
+ shrn v16.2s, v16.2d, #OUT_SHIFT
+ round_sample v19, 0, 0
+ shrn v19.2s, v19.2d, #OUT_SHIFT
+ round_sample v17, 0, 1
+ round_sample v18, 1, 1
+ round_sample v17, 1, 0
+ shrn2 v16.4s, v17.2d, #OUT_SHIFT
+ round_sample v18, 0, 0
+ shrn2 v19.4s, v18.2d, #OUT_SHIFT
+ sqxtn v16.4h, v16.4s
+ sqxtn v18.4h, v19.4s
+.else
+ ext v18.16b, v18.16b, v18.16b, #8
+.endif
+
+ st1 {v16.\st\()}[0], [x3], x4
+ b.eq 4f
+ st1 {v18.\st\()}[1], [x5], x13
+4:
+ st1 {v16.\st\()}[1], [x3], x4
+ st1 {v18.\st\()}[0], [x5], x13
+ st1 {v16.\st\()}[2], [x3], x4
+ st1 {v18.\st\()}[3], [x5], x13
+ st1 {v16.\st\()}[3], [x3], x4
+ st1 {v18.\st\()}[2], [x5], x13
+
+ mov v16.16b, v28.16b
+
+ subs x14, x14, #1
+ add x0, x0, #4<<2
+ sub x10, x10, #4<<2
+ b.gt 1b
+
+// computing samples[16]
+ add x6, x1, #32<<2
+ ld1 {v0.2s}, [x6], x9
+ ld1 {v1.2s}, [x0], x9
+.rept 3
+ ld1 {v2.2s}, [x6], x9
+ ld1 {v3.2s}, [x0], x9
+ MLS v16, v0, v1
+ ld1 {v0.2s}, [x6], x9
+ ld1 {v1.2s}, [x0], x9
+ MLS v16, v2, v3
+.endr
+ ld1 {v2.2s}, [x6], x9
+ ld1 {v3.2s}, [x0], x9
+ MLS v16, v0, v1
+ MLS v16, v2, v3
+
+.ifc \type, fixed
+ and v28.16b, v16.16b, v30.16b
+ shrn v20.2s, v16.2d, #OUT_SHIFT
+ xtn v28.2s, v28.2d
+ sqxtn v20.4h, v20.4s
+ st1 {v28.s}[0], [x2] // save dither_state
+ st1 {v20.h}[0], [x3]
+.else
+ st1 {v16.s}[0], [x3]
+.endif
+
+ ret
+endfunc
+.purgem round_sample
+.purgem MLA
+.purgem MLA2
+.purgem MLS
+.purgem MLS2
+.endm
+
+
+.macro round_sample r, idx, next
+ add \r\().2d, \r\().2d, v28.2d
+.if \idx == 0
+ and v28.16b, \r\().16b, v30.16b
+.else // \idx == 1
+ and v28.16b, \r\().16b, v31.16b
+.endif
+.if \idx != \next
+ .if \next == 0
+ ext v28.16b, v28.16b, v29.16b, #8
+ .else
+ ext v28.16b, v29.16b, v28.16b, #8
+ .endif
+.endif
+.endm
+.macro MLA d, s1, s2
+ smlal \d\().2d, \s1\().2s, \s2\().2s
+.endm
+.macro MLA2 d, s1, s2
+ smlal2 \d\().2d, \s1\().4s, \s2\().4s
+.endm
+.macro MLS d, s1, s2
+ smlsl \d\().2d, \s1\().2s, \s2\().2s
+.endm
+.macro MLS2 d, s1, s2
+ smlsl2 \d\().2d, \s1\().4s, \s2\().4s
+.endm
+apply_window fixed, h
+
+
+// nothing to do for round_sample and ML{A,S}2
+.macro round_sample r, idx, next
+.endm
+.macro MLA2 d, s1, s2
+.endm
+.macro MLS2 d, s1, s2
+.endm
+.macro MLA d, s1, s2
+ fmla \d\().4s, \s1\().4s, \s2\().4s
+.endm
+.macro MLS d, s1, s2
+ fmls \d\().4s, \s1\().4s, \s2\().4s
+.endm
+apply_window float, s
diff --git a/media/ffvpx/libavcodec/aarch64/neon.S b/media/ffvpx/libavcodec/aarch64/neon.S
new file mode 100644
index 0000000000..0fddbecae3
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/neon.S
@@ -0,0 +1,149 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+ trn1 \r8\().8B, \r0\().8B, \r1\().8B
+ trn2 \r9\().8B, \r0\().8B, \r1\().8B
+ trn1 \r1\().8B, \r2\().8B, \r3\().8B
+ trn2 \r3\().8B, \r2\().8B, \r3\().8B
+ trn1 \r0\().8B, \r4\().8B, \r5\().8B
+ trn2 \r5\().8B, \r4\().8B, \r5\().8B
+ trn1 \r2\().8B, \r6\().8B, \r7\().8B
+ trn2 \r7\().8B, \r6\().8B, \r7\().8B
+
+ trn1 \r4\().4H, \r0\().4H, \r2\().4H
+ trn2 \r2\().4H, \r0\().4H, \r2\().4H
+ trn1 \r6\().4H, \r5\().4H, \r7\().4H
+ trn2 \r7\().4H, \r5\().4H, \r7\().4H
+ trn1 \r5\().4H, \r9\().4H, \r3\().4H
+ trn2 \r9\().4H, \r9\().4H, \r3\().4H
+ trn1 \r3\().4H, \r8\().4H, \r1\().4H
+ trn2 \r8\().4H, \r8\().4H, \r1\().4H
+
+ trn1 \r0\().2S, \r3\().2S, \r4\().2S
+ trn2 \r4\().2S, \r3\().2S, \r4\().2S
+
+ trn1 \r1\().2S, \r5\().2S, \r6\().2S
+ trn2 \r5\().2S, \r5\().2S, \r6\().2S
+
+ trn2 \r6\().2S, \r8\().2S, \r2\().2S
+ trn1 \r2\().2S, \r8\().2S, \r2\().2S
+
+ trn1 \r3\().2S, \r9\().2S, \r7\().2S
+ trn2 \r7\().2S, \r9\().2S, \r7\().2S
+.endm
+
+.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+ trn1 \t0\().16B, \r0\().16B, \r1\().16B
+ trn2 \t1\().16B, \r0\().16B, \r1\().16B
+ trn1 \r1\().16B, \r2\().16B, \r3\().16B
+ trn2 \r3\().16B, \r2\().16B, \r3\().16B
+ trn1 \r0\().16B, \r4\().16B, \r5\().16B
+ trn2 \r5\().16B, \r4\().16B, \r5\().16B
+ trn1 \r2\().16B, \r6\().16B, \r7\().16B
+ trn2 \r7\().16B, \r6\().16B, \r7\().16B
+
+ trn1 \r4\().8H, \r0\().8H, \r2\().8H
+ trn2 \r2\().8H, \r0\().8H, \r2\().8H
+ trn1 \r6\().8H, \r5\().8H, \r7\().8H
+ trn2 \r7\().8H, \r5\().8H, \r7\().8H
+ trn1 \r5\().8H, \t1\().8H, \r3\().8H
+ trn2 \t1\().8H, \t1\().8H, \r3\().8H
+ trn1 \r3\().8H, \t0\().8H, \r1\().8H
+ trn2 \t0\().8H, \t0\().8H, \r1\().8H
+
+ trn1 \r0\().4S, \r3\().4S, \r4\().4S
+ trn2 \r4\().4S, \r3\().4S, \r4\().4S
+
+ trn1 \r1\().4S, \r5\().4S, \r6\().4S
+ trn2 \r5\().4S, \r5\().4S, \r6\().4S
+
+ trn2 \r6\().4S, \t0\().4S, \r2\().4S
+ trn1 \r2\().4S, \t0\().4S, \r2\().4S
+
+ trn1 \r3\().4S, \t1\().4S, \r7\().4S
+ trn2 \r7\().4S, \t1\().4S, \r7\().4S
+.endm
+
+.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().16B, \r0\().16B, \r1\().16B
+ trn2 \t5\().16B, \r0\().16B, \r1\().16B
+ trn1 \t6\().16B, \r2\().16B, \r3\().16B
+ trn2 \t7\().16B, \r2\().16B, \r3\().16B
+
+ trn1 \r0\().8H, \t4\().8H, \t6\().8H
+ trn2 \r2\().8H, \t4\().8H, \t6\().8H
+ trn1 \r1\().8H, \t5\().8H, \t7\().8H
+ trn2 \r3\().8H, \t5\().8H, \t7\().8H
+.endm
+
+.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().8B, \r0\().8B, \r1\().8B
+ trn2 \t5\().8B, \r0\().8B, \r1\().8B
+ trn1 \t6\().8B, \r2\().8B, \r3\().8B
+ trn2 \t7\().8B, \r2\().8B, \r3\().8B
+
+ trn1 \r0\().4H, \t4\().4H, \t6\().4H
+ trn2 \r2\().4H, \t4\().4H, \t6\().4H
+ trn1 \r1\().4H, \t5\().4H, \t7\().4H
+ trn2 \r3\().4H, \t5\().4H, \t7\().4H
+.endm
+
+.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
+ trn1 \r4\().4H, \r0\().4H, \r1\().4H
+ trn2 \r5\().4H, \r0\().4H, \r1\().4H
+ trn1 \r6\().4H, \r2\().4H, \r3\().4H
+ trn2 \r7\().4H, \r2\().4H, \r3\().4H
+ trn1 \r0\().2S, \r4\().2S, \r6\().2S
+ trn2 \r2\().2S, \r4\().2S, \r6\().2S
+ trn1 \r1\().2S, \r5\().2S, \r7\().2S
+ trn2 \r3\().2S, \r5\().2S, \r7\().2S
+.endm
+
+.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+ trn1 \r8\().8H, \r0\().8H, \r1\().8H
+ trn2 \r9\().8H, \r0\().8H, \r1\().8H
+ trn1 \r1\().8H, \r2\().8H, \r3\().8H
+ trn2 \r3\().8H, \r2\().8H, \r3\().8H
+ trn1 \r0\().8H, \r4\().8H, \r5\().8H
+ trn2 \r5\().8H, \r4\().8H, \r5\().8H
+ trn1 \r2\().8H, \r6\().8H, \r7\().8H
+ trn2 \r7\().8H, \r6\().8H, \r7\().8H
+
+ trn1 \r4\().4S, \r0\().4S, \r2\().4S
+ trn2 \r2\().4S, \r0\().4S, \r2\().4S
+ trn1 \r6\().4S, \r5\().4S, \r7\().4S
+ trn2 \r7\().4S, \r5\().4S, \r7\().4S
+ trn1 \r5\().4S, \r9\().4S, \r3\().4S
+ trn2 \r9\().4S, \r9\().4S, \r3\().4S
+ trn1 \r3\().4S, \r8\().4S, \r1\().4S
+ trn2 \r8\().4S, \r8\().4S, \r1\().4S
+
+ trn1 \r0\().2D, \r3\().2D, \r4\().2D
+ trn2 \r4\().2D, \r3\().2D, \r4\().2D
+
+ trn1 \r1\().2D, \r5\().2D, \r6\().2D
+ trn2 \r5\().2D, \r5\().2D, \r6\().2D
+
+ trn2 \r6\().2D, \r8\().2D, \r2\().2D
+ trn1 \r2\().2D, \r8\().2D, \r2\().2D
+
+ trn1 \r3\().2D, \r9\().2D, \r7\().2D
+ trn2 \r7\().2D, \r9\().2D, \r7\().2D
+
+.endm
diff --git a/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
new file mode 100644
index 0000000000..5e4d021a97
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4c ((1<<(COL_SHIFT-1))/Z4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define z1 v0.H[0]
+#define z2 v0.H[1]
+#define z3 v0.H[2]
+#define z4 v0.H[3]
+#define z5 v0.H[4]
+#define z6 v0.H[5]
+#define z7 v0.H[6]
+#define z4c v0.H[7]
+
+const idct_coeff_neon, align=4
+ .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+endconst
+
+.macro idct_start data
+ prfm pldl1keep, [\data]
+ mov x10, x30
+ movrel x3, idct_coeff_neon
+ ld1 {v0.2D}, [x3]
+.endm
+
+.macro idct_end
+ br x10
+.endm
+
+.macro smull1 a, b, c
+ smull \a, \b, \c
+.endm
+
+.macro smlal1 a, b, c
+ smlal \a, \b, \c
+.endm
+
+.macro smlsl1 a, b, c
+ smlsl \a, \b, \c
+.endm
+
+.macro idct_col4_top y1, y2, y3, y4, i, l
+ smull\i v7.4S, \y3\l, z2
+ smull\i v16.4S, \y3\l, z6
+ smull\i v17.4S, \y2\l, z1
+ add v19.4S, v23.4S, v7.4S
+ smull\i v18.4S, \y2\l, z3
+ add v20.4S, v23.4S, v16.4S
+ smull\i v5.4S, \y2\l, z5
+ sub v21.4S, v23.4S, v16.4S
+ smull\i v6.4S, \y2\l, z7
+ sub v22.4S, v23.4S, v7.4S
+
+ smlal\i v17.4S, \y4\l, z3
+ smlsl\i v18.4S, \y4\l, z7
+ smlsl\i v5.4S, \y4\l, z1
+ smlsl\i v6.4S, \y4\l, z5
+.endm
+
+.macro idct_row4_neon y1, y2, y3, y4, pass
+ ld1 {\y1\().2D,\y2\().2D}, [x2], #32
+ movi v23.4S, #1<<2, lsl #8
+ orr v5.16B, \y1\().16B, \y2\().16B
+ ld1 {\y3\().2D,\y4\().2D}, [x2], #32
+ orr v6.16B, \y3\().16B, \y4\().16B
+ orr v5.16B, v5.16B, v6.16B
+ mov x3, v5.D[1]
+ smlal v23.4S, \y1\().4H, z4
+
+ idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
+
+ cmp x3, #0
+ b.eq \pass\()f
+
+ smull2 v7.4S, \y1\().8H, z4
+ smlal2 v17.4S, \y2\().8H, z5
+ smlsl2 v18.4S, \y2\().8H, z1
+ smull2 v16.4S, \y3\().8H, z2
+ smlal2 v5.4S, \y2\().8H, z7
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v7.4S
+ sub v21.4S, v21.4S, v7.4S
+ add v22.4S, v22.4S, v7.4S
+ smlal2 v6.4S, \y2\().8H, z3
+ smull2 v7.4S, \y3\().8H, z6
+ smlal2 v17.4S, \y4\().8H, z7
+ smlsl2 v18.4S, \y4\().8H, z5
+ smlal2 v5.4S, \y4\().8H, z3
+ smlsl2 v6.4S, \y4\().8H, z1
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v16.4S
+ add v21.4S, v21.4S, v16.4S
+ sub v22.4S, v22.4S, v7.4S
+
+\pass: add \y3\().4S, v19.4S, v17.4S
+ add \y4\().4S, v20.4S, v18.4S
+ shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
+ shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
+ add v7.4S, v21.4S, v5.4S
+ add v16.4S, v22.4S, v6.4S
+ shrn \y3\().4H, v7.4S, #ROW_SHIFT
+ shrn \y4\().4H, v16.4S, #ROW_SHIFT
+ sub v22.4S, v22.4S, v6.4S
+ sub v19.4S, v19.4S, v17.4S
+ sub v21.4S, v21.4S, v5.4S
+ shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
+ sub v20.4S, v20.4S, v18.4S
+ shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
+ shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
+ shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
+
+ trn1 v16.8H, \y1\().8H, \y2\().8H
+ trn2 v17.8H, \y1\().8H, \y2\().8H
+ trn1 v18.8H, \y3\().8H, \y4\().8H
+ trn2 v19.8H, \y3\().8H, \y4\().8H
+ trn1 \y1\().4S, v16.4S, v18.4S
+ trn1 \y2\().4S, v17.4S, v19.4S
+ trn2 \y3\().4S, v16.4S, v18.4S
+ trn2 \y4\().4S, v17.4S, v19.4S
+.endm
+
+.macro declare_idct_col4_neon i, l
+function idct_col4_neon\i
+ dup v23.4H, z4c
+.if \i == 1
+ add v23.4H, v23.4H, v24.4H
+.else
+ mov v5.D[0], v24.D[1]
+ add v23.4H, v23.4H, v5.4H
+.endif
+ smull v23.4S, v23.4H, z4
+
+ idct_col4_top v24, v25, v26, v27, \i, \l
+
+ mov x4, v28.D[\i - 1]
+ mov x5, v29.D[\i - 1]
+ cmp x4, #0
+ b.eq 1f
+
+ smull\i v7.4S, v28\l, z4
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v7.4S
+ sub v21.4S, v21.4S, v7.4S
+ add v22.4S, v22.4S, v7.4S
+
+1: mov x4, v30.D[\i - 1]
+ cmp x5, #0
+ b.eq 2f
+
+ smlal\i v17.4S, v29\l, z5
+ smlsl\i v18.4S, v29\l, z1
+ smlal\i v5.4S, v29\l, z7
+ smlal\i v6.4S, v29\l, z3
+
+2: mov x5, v31.D[\i - 1]
+ cmp x4, #0
+ b.eq 3f
+
+ smull\i v7.4S, v30\l, z6
+ smull\i v16.4S, v30\l, z2
+ add v19.4S, v19.4S, v7.4S
+ sub v22.4S, v22.4S, v7.4S
+ sub v20.4S, v20.4S, v16.4S
+ add v21.4S, v21.4S, v16.4S
+
+3: cmp x5, #0
+ b.eq 4f
+
+ smlal\i v17.4S, v31\l, z7
+ smlsl\i v18.4S, v31\l, z5
+ smlal\i v5.4S, v31\l, z3
+ smlsl\i v6.4S, v31\l, z1
+
+4: addhn v7.4H, v19.4S, v17.4S
+ addhn2 v7.8H, v20.4S, v18.4S
+ subhn v18.4H, v20.4S, v18.4S
+ subhn2 v18.8H, v19.4S, v17.4S
+
+ addhn v16.4H, v21.4S, v5.4S
+ addhn2 v16.8H, v22.4S, v6.4S
+ subhn v17.4H, v22.4S, v6.4S
+ subhn2 v17.8H, v21.4S, v5.4S
+
+ ret
+endfunc
+.endm
+
+declare_idct_col4_neon 1, .4H
+declare_idct_col4_neon 2, .8H
+
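+// ff_simple_idct_put/add_neon: x0 = dest, x1 = line size, x2 = coefficient block.
+// ff_simple_idct_neon transforms the block at x0 in place.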
+function ff_simple_idct_put_neon, export=1
+ idct_start x2
+
+ idct_row4_neon v24, v25, v26, v27, 1
+ idct_row4_neon v28, v29, v30, v31, 2
+ bl idct_col4_neon1
+
+ sqshrun v1.8B, v7.8H, #COL_SHIFT-16
+ sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
+ sqshrun v3.8B, v17.8H, #COL_SHIFT-16
+ sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sqshrun v2.8B, v7.8H, #COL_SHIFT-16
+ sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
+ sqshrun v4.8B, v17.8H, #COL_SHIFT-16
+ sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
+
+ zip1 v16.4S, v1.4S, v2.4S
+ zip2 v17.4S, v1.4S, v2.4S
+
+ st1 {v16.D}[0], [x0], x1
+ st1 {v16.D}[1], [x0], x1
+
+ zip1 v18.4S, v3.4S, v4.4S
+ zip2 v19.4S, v3.4S, v4.4S
+
+ st1 {v17.D}[0], [x0], x1
+ st1 {v17.D}[1], [x0], x1
+ st1 {v18.D}[0], [x0], x1
+ st1 {v18.D}[1], [x0], x1
+ st1 {v19.D}[0], [x0], x1
+ st1 {v19.D}[1], [x0], x1
+
+ idct_end
+endfunc
+
+function ff_simple_idct_add_neon, export=1
+ idct_start x2
+
+ idct_row4_neon v24, v25, v26, v27, 1
+ idct_row4_neon v28, v29, v30, v31, 2
+ bl idct_col4_neon1
+
+ sshr v1.8H, v7.8H, #COL_SHIFT-16
+ sshr v2.8H, v16.8H, #COL_SHIFT-16
+ sshr v3.8H, v17.8H, #COL_SHIFT-16
+ sshr v4.8H, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sshr v7.8H, v7.8H, #COL_SHIFT-16
+ sshr v16.8H, v16.8H, #COL_SHIFT-16
+ sshr v17.8H, v17.8H, #COL_SHIFT-16
+ sshr v18.8H, v18.8H, #COL_SHIFT-16
+
+ mov x9, x0
+ ld1 {v19.D}[0], [x0], x1
+ zip1 v23.2D, v1.2D, v7.2D
+ zip2 v24.2D, v1.2D, v7.2D
+ ld1 {v19.D}[1], [x0], x1
+ zip1 v25.2D, v2.2D, v16.2D
+ zip2 v26.2D, v2.2D, v16.2D
+ ld1 {v20.D}[0], [x0], x1
+ zip1 v27.2D, v3.2D, v17.2D
+ zip2 v28.2D, v3.2D, v17.2D
+ ld1 {v20.D}[1], [x0], x1
+ zip1 v29.2D, v4.2D, v18.2D
+ zip2 v30.2D, v4.2D, v18.2D
+ ld1 {v21.D}[0], [x0], x1
+ uaddw v23.8H, v23.8H, v19.8B
+ uaddw2 v24.8H, v24.8H, v19.16B
+ ld1 {v21.D}[1], [x0], x1
+ sqxtun v23.8B, v23.8H
+ sqxtun2 v23.16B, v24.8H
+ ld1 {v22.D}[0], [x0], x1
+ uaddw v24.8H, v25.8H, v20.8B
+ uaddw2 v25.8H, v26.8H, v20.16B
+ ld1 {v22.D}[1], [x0], x1
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v25.8H
+ st1 {v23.D}[0], [x9], x1
+ uaddw v25.8H, v27.8H, v21.8B
+ uaddw2 v26.8H, v28.8H, v21.16B
+ st1 {v23.D}[1], [x9], x1
+ sqxtun v25.8B, v25.8H
+ sqxtun2 v25.16B, v26.8H
+ st1 {v24.D}[0], [x9], x1
+ uaddw v26.8H, v29.8H, v22.8B
+ uaddw2 v27.8H, v30.8H, v22.16B
+ st1 {v24.D}[1], [x9], x1
+ sqxtun v26.8B, v26.8H
+ sqxtun2 v26.16B, v27.8H
+ st1 {v25.D}[0], [x9], x1
+ st1 {v25.D}[1], [x9], x1
+ st1 {v26.D}[0], [x9], x1
+ st1 {v26.D}[1], [x9], x1
+
+ idct_end
+endfunc
+
+function ff_simple_idct_neon, export=1
+ idct_start x0
+
+ mov x2, x0
+ idct_row4_neon v24, v25, v26, v27, 1
+ idct_row4_neon v28, v29, v30, v31, 2
+ sub x2, x2, #128
+ bl idct_col4_neon1
+
+ sshr v1.8H, v7.8H, #COL_SHIFT-16
+ sshr v2.8H, v16.8H, #COL_SHIFT-16
+ sshr v3.8H, v17.8H, #COL_SHIFT-16
+ sshr v4.8H, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sshr v7.8H, v7.8H, #COL_SHIFT-16
+ sshr v16.8H, v16.8H, #COL_SHIFT-16
+ sshr v17.8H, v17.8H, #COL_SHIFT-16
+ sshr v18.8H, v18.8H, #COL_SHIFT-16
+
+ zip1 v23.2D, v1.2D, v7.2D
+ zip2 v24.2D, v1.2D, v7.2D
+ st1 {v23.2D,v24.2D}, [x2], #32
+ zip1 v25.2D, v2.2D, v16.2D
+ zip2 v26.2D, v2.2D, v16.2D
+ st1 {v25.2D,v26.2D}, [x2], #32
+ zip1 v27.2D, v3.2D, v17.2D
+ zip2 v28.2D, v3.2D, v17.2D
+ st1 {v27.2D,v28.2D}, [x2], #32
+ zip1 v29.2D, v4.2D, v18.2D
+ zip2 v30.2D, v4.2D, v18.2D
+ st1 {v29.2D,v30.2D}, [x2], #32
+
+ idct_end
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
new file mode 100644
index 0000000000..13dfd74940
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vc1dsp.h"
+
+#include "config.h"
+
+void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+ }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/videodsp.S b/media/ffvpx/libavcodec/aarch64/videodsp.S
new file mode 100644
index 0000000000..24067cc2af
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/videodsp.S
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
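+// ff_prefetch_aarch64(buf = x0, stride = x1, h = w2): prefetches two rows per pass
+// and tail-calls itself while h remains positive.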
+function ff_prefetch_aarch64, export=1
+ subs w2, w2, #2
+ prfm pldl1strm, [x0]
+ prfm pldl1strm, [x0, x1]
+ add x0, x0, x1, lsl #1
+ b.gt X(ff_prefetch_aarch64)
+ ret
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/videodsp_init.c b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
new file mode 100644
index 0000000000..6f667a6d3e
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
@@ -0,0 +1,32 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/videodsp.h"
+
+void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);
+
+av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_armv8(cpu_flags))
+ ctx->prefetch = ff_prefetch_aarch64;
+}
diff --git a/media/ffvpx/libavcodec/aarch64/vp8dsp.h b/media/ffvpx/libavcodec/aarch64/vp8dsp.h
new file mode 100644
index 0000000000..871fed7a95
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp8dsp.h
@@ -0,0 +1,75 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_VP8DSP_H
+#define AVCODEC_AARCH64_VP8DSP_H
+
+#include "libavcodec/vp8dsp.h"
+
+#define VP8_LF_Y(hv, inner, opt) \
+ void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim_E, int flim_I, \
+ int hev_thresh)
+
+#define VP8_LF_UV(hv, inner, opt) \
+ void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \
+ uint8_t *dstV, \
+ ptrdiff_t stride, \
+ int flim_E, int flim_I, \
+ int hev_thresh)
+
+#define VP8_LF_SIMPLE(hv, opt) \
+ void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim)
+
+#define VP8_LF_HV(inner, opt) \
+ VP8_LF_Y(h, inner, opt); \
+ VP8_LF_Y(v, inner, opt); \
+ VP8_LF_UV(h, inner, opt); \
+ VP8_LF_UV(v, inner, opt)
+
+#define VP8_LF(opt) \
+ VP8_LF_HV(, opt); \
+ VP8_LF_HV(_inner, opt); \
+ VP8_LF_SIMPLE(h, opt); \
+ VP8_LF_SIMPLE(v, opt)
+
+#define VP8_MC(n, opt) \
+ void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int h, int x, int y)
+
+#define VP8_EPEL(w, opt) \
+ VP8_MC(pixels ## w, opt); \
+ VP8_MC(epel ## w ## _h4, opt); \
+ VP8_MC(epel ## w ## _h6, opt); \
+ VP8_MC(epel ## w ## _v4, opt); \
+ VP8_MC(epel ## w ## _h4v4, opt); \
+ VP8_MC(epel ## w ## _h6v4, opt); \
+ VP8_MC(epel ## w ## _v6, opt); \
+ VP8_MC(epel ## w ## _h4v6, opt); \
+ VP8_MC(epel ## w ## _h6v6, opt)
+
+#define VP8_BILIN(w, opt) \
+ VP8_MC(bilin ## w ## _h, opt); \
+ VP8_MC(bilin ## w ## _v, opt); \
+ VP8_MC(bilin ## w ## _hv, opt)
+
+#endif /* AVCODEC_AARCH64_VP8DSP_H */
diff --git a/media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c
new file mode 100644
index 0000000000..fc7e831d17
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c
@@ -0,0 +1,124 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp.h"
+
+void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
+
+void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+
+VP8_LF(neon);
+
+VP8_EPEL(16, neon);
+VP8_EPEL(8, neon);
+VP8_EPEL(4, neon);
+
+VP8_BILIN(16, neon);
+VP8_BILIN(8, neon);
+VP8_BILIN(4, neon);
+
+av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp)
+{
+ if (!have_neon(av_get_cpu_flags()))
+ return;
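+    /* put_vp8_epel_pixels_tab[i][j][k]: i selects the block width (0 = 16, 1 = 8, 2 = 4),
+     * j the vertical filter (0 = none, 1 = 4-tap, 2 = 6-tap), k the horizontal one. */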
+ dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+ dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
+
+ dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
+
+ dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
+}
+
+av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
+{
+ if (!have_neon(av_get_cpu_flags()))
+ return;
+ dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
+
+ dsp->vp8_idct_add = ff_vp8_idct_add_neon;
+ dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
+ dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
+ dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
+
+ dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
+ dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
+ dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
+ dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
+
+ dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
+ dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
+ dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
+ dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
+
+ dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
+ dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
+}
diff --git a/media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S b/media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S
new file mode 100644
index 0000000000..4bbf16d1a4
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S
@@ -0,0 +1,1790 @@
+/*
+ * VP8 NEON optimisations
+ *
+ * Copyright (c) 2010 Rob Clark <rob@ti.com>
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+function ff_vp8_luma_dc_wht_neon, export=1
+ ld1 {v0.4h - v3.4h}, [x1]
+ movi v30.8h, #0
+
+ add v4.4h, v0.4h, v3.4h
+ add v6.4h, v1.4h, v2.4h
+ st1 {v30.8h}, [x1], #16
+ sub v7.4h, v1.4h, v2.4h
+ sub v5.4h, v0.4h, v3.4h
+ st1 {v30.8h}, [x1]
+ add v0.4h, v4.4h, v6.4h
+ add v1.4h, v5.4h, v7.4h
+ sub v2.4h, v4.4h, v6.4h
+ sub v3.4h, v5.4h, v7.4h
+
+ movi v16.4h, #3
+
+ transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
+
+ add v0.4h, v0.4h, v16.4h
+
+ add v4.4h, v0.4h, v3.4h
+ add v6.4h, v1.4h, v2.4h
+ sub v7.4h, v1.4h, v2.4h
+ sub v5.4h, v0.4h, v3.4h
+ add v0.4h, v4.4h, v6.4h
+ add v1.4h, v5.4h, v7.4h
+ sub v2.4h, v4.4h, v6.4h
+ sub v3.4h, v5.4h, v7.4h
+
+ sshr v0.4h, v0.4h, #3
+ sshr v1.4h, v1.4h, #3
+ sshr v2.4h, v2.4h, #3
+ sshr v3.4h, v3.4h, #3
+
+ mov x3, #32
+ st1 {v0.h}[0], [x0], x3
+ st1 {v1.h}[0], [x0], x3
+ st1 {v2.h}[0], [x0], x3
+ st1 {v3.h}[0], [x0], x3
+ st1 {v0.h}[1], [x0], x3
+ st1 {v1.h}[1], [x0], x3
+ st1 {v2.h}[1], [x0], x3
+ st1 {v3.h}[1], [x0], x3
+ st1 {v0.h}[2], [x0], x3
+ st1 {v1.h}[2], [x0], x3
+ st1 {v2.h}[2], [x0], x3
+ st1 {v3.h}[2], [x0], x3
+ st1 {v0.h}[3], [x0], x3
+ st1 {v1.h}[3], [x0], x3
+ st1 {v2.h}[3], [x0], x3
+ st1 {v3.h}[3], [x0], x3
+
+ ret
+endfunc
+
+function ff_vp8_idct_add_neon, export=1
+ ld1 {v0.8b - v3.8b}, [x1]
+ mov w4, #20091
+ movk w4, #35468/2, lsl #16
+ dup v4.2s, w4
+
+ smull v26.4s, v1.4h, v4.h[0]
+ smull v27.4s, v3.4h, v4.h[0]
+ sqdmulh v20.4h, v1.4h, v4.h[1]
+ sqdmulh v23.4h, v3.4h, v4.h[1]
+ shrn v21.4h, v26.4s, #16
+ shrn v22.4h, v27.4s, #16
+ add v21.4h, v21.4h, v1.4h
+ add v22.4h, v22.4h, v3.4h
+
+ add v16.4h, v0.4h, v2.4h
+ sub v17.4h, v0.4h, v2.4h
+
+ add v18.4h, v21.4h, v23.4h
+ sub v19.4h, v20.4h, v22.4h
+
+ add v0.4h, v16.4h, v18.4h
+ add v1.4h, v17.4h, v19.4h
+ sub v3.4h, v16.4h, v18.4h
+ sub v2.4h, v17.4h, v19.4h
+
+ transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7
+
+ movi v29.8h, #0
+ smull v26.4s, v1.4h, v4.h[0]
+ st1 {v29.8h}, [x1], #16
+ smull v27.4s, v3.4h, v4.h[0]
+ st1 {v29.16b}, [x1]
+ sqdmulh v21.4h, v1.4h, v4.h[1]
+ sqdmulh v23.4h, v3.4h, v4.h[1]
+ shrn v20.4h, v26.4s, #16
+ shrn v22.4h, v27.4s, #16
+ add v20.4h, v20.4h, v1.4h
+ add v22.4h, v22.4h, v3.4h
+ add v16.4h, v0.4h, v2.4h
+ sub v17.4h, v0.4h, v2.4h
+
+ add v18.4h, v20.4h, v23.4h
+ ld1 {v24.s}[0], [x0], x2
+ sub v19.4h, v21.4h, v22.4h
+ ld1 {v25.s}[0], [x0], x2
+ add v0.4h, v16.4h, v18.4h
+ add v1.4h, v17.4h, v19.4h
+ ld1 {v26.s}[0], [x0], x2
+ sub v3.4h, v16.4h, v18.4h
+ sub v2.4h, v17.4h, v19.4h
+ ld1 {v27.s}[0], [x0], x2
+ srshr v0.4h, v0.4h, #3
+ srshr v1.4h, v1.4h, #3
+ srshr v2.4h, v2.4h, #3
+ srshr v3.4h, v3.4h, #3
+
+ sub x0, x0, x2, lsl #2
+
+ transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16
+
+ uaddw v0.8h, v0.8h, v24.8b
+ uaddw v1.8h, v1.8h, v25.8b
+ uaddw v2.8h, v2.8h, v26.8b
+ uaddw v3.8h, v3.8h, v27.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+
+ st1 {v0.s}[0], [x0], x2
+ st1 {v1.s}[0], [x0], x2
+ st1 {v2.s}[0], [x0], x2
+ st1 {v3.s}[0], [x0], x2
+
+ ret
+endfunc
+
+function ff_vp8_idct_dc_add4uv_neon, export=1
+ movi v0.4h, #0
+ mov x3, #32
+ ld1r {v16.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v17.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v18.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v19.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ mov x3, x0
+ srshr v16.8h, v16.8h, #3 // dc >>= 3
+ ld1 {v0.8b}, [x0], x2
+ srshr v18.8h, v18.8h, #3
+ ld1 {v1.8b}, [x0], x2
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v2.8b}, [x0], x2
+ uaddw v0.8h, v16.8h, v1.8b
+ ld1 {v3.8b}, [x0], x2
+ uaddw v22.8h, v16.8h, v2.8b
+ ld1 {v4.8b}, [x0], x2
+ uaddw v2.8h, v16.8h, v3.8b
+ ld1 {v5.8b}, [x0], x2
+ uaddw v24.8h, v18.8h, v4.8b
+ ld1 {v6.8b}, [x0], x2
+ uaddw v4.8h, v18.8h, v5.8b
+ ld1 {v7.8b}, [x0], x2
+ uaddw v26.8h, v18.8h, v6.8b
+ sqxtun v20.8b, v20.8h
+ uaddw v6.8h, v18.8h, v7.8b
+ sqxtun v21.8b, v0.8h
+ sqxtun v22.8b, v22.8h
+ st1 {v20.8b}, [x3], x2
+ sqxtun v23.8b, v2.8h
+ st1 {v21.8b}, [x3], x2
+ sqxtun v24.8b, v24.8h
+ st1 {v22.8b}, [x3], x2
+ sqxtun v25.8b, v4.8h
+ st1 {v23.8b}, [x3], x2
+ sqxtun v26.8b, v26.8h
+ st1 {v24.8b}, [x3], x2
+ sqxtun v27.8b, v6.8h
+ st1 {v25.8b}, [x3], x2
+ st1 {v26.8b}, [x3], x2
+ st1 {v27.8b}, [x3], x2
+
+ ret
+endfunc
+
+function ff_vp8_idct_dc_add4y_neon, export=1
+ movi v0.16b, #0
+ mov x3, #32
+ ld1r {v16.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v17.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ zip1 v16.2d, v16.2d, v17.2d
+ ld1r {v18.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v19.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ zip1 v18.2d, v18.2d, v19.2d
+ srshr v16.8h, v16.8h, #3 // dc >>= 3
+ ld1 {v0.16b}, [x0], x2
+ srshr v18.8h, v18.8h, #3
+ ld1 {v1.16b}, [x0], x2
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v2.16b}, [x0], x2
+ uaddw2 v0.8h, v18.8h, v0.16b
+ ld1 {v3.16b}, [x0], x2
+ uaddw v21.8h, v16.8h, v1.8b
+ uaddw2 v1.8h, v18.8h, v1.16b
+ uaddw v22.8h, v16.8h, v2.8b
+ uaddw2 v2.8h, v18.8h, v2.16b
+ uaddw v23.8h, v16.8h, v3.8b
+ uaddw2 v3.8h, v18.8h, v3.16b
+ sub x0, x0, x2, lsl #2
+ sqxtun v20.8b, v20.8h
+ sqxtun2 v20.16b, v0.8h
+ sqxtun v21.8b, v21.8h
+ sqxtun2 v21.16b, v1.8h
+ sqxtun v22.8b, v22.8h
+ st1 {v20.16b}, [x0], x2
+ sqxtun2 v22.16b, v2.8h
+ st1 {v21.16b}, [x0], x2
+ sqxtun v23.8b, v23.8h
+ st1 {v22.16b}, [x0], x2
+ sqxtun2 v23.16b, v3.8h
+ st1 {v23.16b}, [x0], x2
+
+ ret
+endfunc
+
+function ff_vp8_idct_dc_add_neon, export=1
+ mov w3, #0
+ ld1r {v2.8h}, [x1]
+ strh w3, [x1]
+ srshr v2.8h, v2.8h, #3
+ ld1 {v0.s}[0], [x0], x2
+ ld1 {v0.s}[1], [x0], x2
+ uaddw v3.8h, v2.8h, v0.8b
+ ld1 {v1.s}[0], [x0], x2
+ ld1 {v1.s}[1], [x0], x2
+ uaddw v4.8h, v2.8h, v1.8b
+ sqxtun v0.8b, v3.8h
+ sqxtun v1.8b, v4.8h
+ sub x0, x0, x2, lsl #2
+ st1 {v0.s}[0], [x0], x2
+ st1 {v0.s}[1], [x0], x2
+ st1 {v1.s}[0], [x0], x2
+ st1 {v1.s}[1], [x0], x2
+ ret
+endfunc
+
+// Register layout:
+// P3..Q3 -> v0..v7
+// flim_E -> v22
+// flim_I -> v23
+// hev_thresh -> macro argument (w4 for the luma filters, w5 for the 8uv ones)
+//
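+// On return, the full filter updates v1-v6 (P2..Q2), the inner variant v2-v5 and the
+// simple variant v3/v4; v0 (P3) and v7 (Q3) are never written.
+//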
+.macro vp8_loop_filter, inner=0, simple=0, hev_thresh
+ .if \simple
+ uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0)
+ uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1)
+ uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
+ ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2
+ uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+ movi v21.16b, #0x80
+ cmhs v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
+ .else
+ // calculate hev and normal_limit:
+ uabd v20.16b, v2.16b, v3.16b // abs(P1-P0)
+ uabd v21.16b, v5.16b, v4.16b // abs(Q1-Q0)
+ uabd v18.16b, v0.16b, v1.16b // abs(P3-P2)
+ uabd v19.16b, v1.16b, v2.16b // abs(P2-P1)
+ cmhs v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I
+ cmhs v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I
+ cmhs v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I
+ cmhs v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I
+ and v16.16b, v17.16b, v16.16b
+ uabd v17.16b, v7.16b, v6.16b // abs(Q3-Q2)
+ and v16.16b, v16.16b, v19.16b
+ uabd v19.16b, v6.16b, v5.16b // abs(Q2-Q1)
+ and v16.16b, v16.16b, v18.16b
+ cmhs v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I
+ cmhs v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I
+ uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0)
+ uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1)
+ and v16.16b, v16.16b, v18.16b
+ uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
+ and v16.16b, v16.16b, v19.16b
+ ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2
+ dup v23.16b, \hev_thresh // hev_thresh
+ uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+ cmhi v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh
+ cmhs v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
+ cmhi v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh
+ and v16.16b, v16.16b, v19.16b
+ movi v21.16b, #0x80
+ orr v17.16b, v20.16b, v22.16b
+ .endif
+
+ // at this point:
+ // v16: normal_limit
+ // v17: hev
+
+ // convert to signed value:
+ eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
+ eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
+
+ movi v20.8h, #3
+ ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
+ ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
+ eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
+ eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
+ mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
+ mul v19.8h, v19.8h, v20.8h
+
+ sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
+ movi v22.16b, #4
+ movi v23.16b, #3
+ .if \inner
+ and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
+ .endif
+ saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
+ saddw2 v19.8h, v19.8h, v20.16b
+ sqxtn v18.8b, v18.8h // narrow result back into v18
+ sqxtn2 v18.16b, v19.8h
+ .if !\inner && !\simple
+ eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
+ eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
+ .endif
+ and v18.16b, v18.16b, v16.16b // w &= normal_limit
+
+ // registers used at this point..
+ // v0 -> P3 (don't corrupt)
+ // v1-v6 -> PS2-QS2
+ // v7 -> Q3 (don't corrupt)
+ // v17 -> hev
+ // v18 -> w
+ // v21 -> #0x80
+ // v22 -> #4
+ // v23 -> #3
+ // v16, v19, v29 -> unused
+ //
+ // filter_common: is4tap==1
+ // c1 = clamp(w + 4) >> 3;
+ // c2 = clamp(w + 3) >> 3;
+ // Q0 = s2u(QS0 - c1);
+ // P0 = s2u(PS0 + c2);
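+        // (s2u(x) converts back to unsigned by xoring with 0x80)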
+
+ .if \simple
+        sqadd           v19.16b, v18.16b, v22.16b          // c1 = clamp(w + 4)
+        sqadd           v20.16b, v18.16b, v23.16b          // c2 = clamp(w + 3)
+ sshr v19.16b, v19.16b, #3 // c1 >>= 3
+ sshr v20.16b, v20.16b, #3 // c2 >>= 3
+ sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
+ sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
+ eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
+ eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
+ eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
+ eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
+ .elseif \inner
+ // the !is4tap case of filter_common, only used for inner blocks
+ // c3 = ((c1&~hev) + 1) >> 1;
+ // Q1 = s2u(QS1 - c3);
+ // P1 = s2u(PS1 + c3);
+        sqadd           v19.16b, v18.16b, v22.16b          // c1 = clamp(w + 4)
+        sqadd           v20.16b, v18.16b, v23.16b          // c2 = clamp(w + 3)
+ sshr v19.16b, v19.16b, #3 // c1 >>= 3
+ sshr v20.16b, v20.16b, #3 // c2 >>= 3
+ sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
+ sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
+ bic v19.16b, v19.16b, v17.16b // c1 & ~hev
+ eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
+ srshr v19.16b, v19.16b, #1 // c3 >>= 1
+ eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
+ sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
+ sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
+ eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
+ eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
+ .else
+ and v20.16b, v18.16b, v17.16b // w & hev
+ sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
+ sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
+ sshr v19.16b, v19.16b, #3 // c1 >>= 3
+ sshr v20.16b, v20.16b, #3 // c2 >>= 3
+ bic v18.16b, v18.16b, v17.16b // w &= ~hev
+ sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
+ sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
+
+ // filter_mbedge:
+ // a = clamp((27*w + 63) >> 7);
+ // Q0 = s2u(QS0 - a);
+ // P0 = s2u(PS0 + a);
+ // a = clamp((18*w + 63) >> 7);
+ // Q1 = s2u(QS1 - a);
+ // P1 = s2u(PS1 + a);
+ // a = clamp((9*w + 63) >> 7);
+ // Q2 = s2u(QS2 - a);
+ // P2 = s2u(PS2 + a);
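+        // 9*w is built as (w << 3) + w; re-adding that widened term to the 63-biased sums gives 18*w and 27*w.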
+ movi v17.8h, #63
+ sshll v22.8h, v18.8b, #3
+ sshll2 v23.8h, v18.16b, #3
+ saddw v22.8h, v22.8h, v18.8b
+ saddw2 v23.8h, v23.8h, v18.16b
+ add v16.8h, v17.8h, v22.8h
+ add v17.8h, v17.8h, v23.8h // 9*w + 63
+ add v19.8h, v16.8h, v22.8h
+ add v20.8h, v17.8h, v23.8h // 18*w + 63
+ add v22.8h, v19.8h, v22.8h
+ add v23.8h, v20.8h, v23.8h // 27*w + 63
+ sqshrn v16.8b, v16.8h, #7
+ sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
+ sqshrn v19.8b, v19.8h, #7
+ sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
+ sqshrn v22.8b, v22.8h, #7
+ sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
+ sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
+ sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
+ sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
+ sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
+ sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
+ sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
+ eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
+ eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
+ eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
+ eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
+ eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
+ eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
+ .endif
+.endm
+
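+// ff_vp8_v_loop_filter16*: x0 = dst, x1 = stride, w2 = flim_E (flim for the simple
+// variant), w3 = flim_I, w4 = hev_thresh.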
+.macro vp8_v_loop_filter16 name, inner=0, simple=0
+function ff_vp8_v_loop_filter16\name\()_neon, export=1
+ sub x0, x0, x1, lsl #1+!\simple
+
+ // Load pixels:
+ .if !\simple
+ ld1 {v0.16b}, [x0], x1 // P3
+ ld1 {v1.16b}, [x0], x1 // P2
+ .endif
+ ld1 {v2.16b}, [x0], x1 // P1
+ ld1 {v3.16b}, [x0], x1 // P0
+ ld1 {v4.16b}, [x0], x1 // Q0
+ ld1 {v5.16b}, [x0], x1 // Q1
+ .if !\simple
+ ld1 {v6.16b}, [x0], x1 // Q2
+ ld1 {v7.16b}, [x0] // Q3
+ dup v23.16b, w3 // flim_I
+ .endif
+ dup v22.16b, w2 // flim_E
+
+ vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
+
+        // back up to P2 (P1 for the simple filter): dst -= stride * 6 (stride * 4 for simple)
+ sub x0, x0, x1, lsl #2
+ .if !\simple
+ sub x0, x0, x1, lsl #1
+
+ // Store pixels:
+ st1 {v1.16b}, [x0], x1 // P2
+ .endif
+ st1 {v2.16b}, [x0], x1 // P1
+ st1 {v3.16b}, [x0], x1 // P0
+ st1 {v4.16b}, [x0], x1 // Q0
+ st1 {v5.16b}, [x0], x1 // Q1
+ .if !\simple
+ st1 {v6.16b}, [x0] // Q2
+ .endif
+
+ ret
+endfunc
+.endm
+
+vp8_v_loop_filter16
+vp8_v_loop_filter16 _inner, inner=1
+vp8_v_loop_filter16 _simple, simple=1
+
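+// ff_vp8_v_loop_filter8uv*: x0 = dstU, x1 = dstV, x2 = stride, w3 = flim_E, w4 = flim_I,
+// w5 = hev_thresh. U and V rows are packed into the low/high halves of each vector.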
+.macro vp8_v_loop_filter8uv name, inner=0
+function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
+ sub x0, x0, x2, lsl #2
+ sub x1, x1, x2, lsl #2
+ // Load pixels:
+ ld1 {v0.d}[0], [x0], x2 // P3
+ ld1 {v0.d}[1], [x1], x2 // P3
+ ld1 {v1.d}[0], [x0], x2 // P2
+ ld1 {v1.d}[1], [x1], x2 // P2
+ ld1 {v2.d}[0], [x0], x2 // P1
+ ld1 {v2.d}[1], [x1], x2 // P1
+ ld1 {v3.d}[0], [x0], x2 // P0
+ ld1 {v3.d}[1], [x1], x2 // P0
+ ld1 {v4.d}[0], [x0], x2 // Q0
+ ld1 {v4.d}[1], [x1], x2 // Q0
+ ld1 {v5.d}[0], [x0], x2 // Q1
+ ld1 {v5.d}[1], [x1], x2 // Q1
+ ld1 {v6.d}[0], [x0], x2 // Q2
+ ld1 {v6.d}[1], [x1], x2 // Q2
+ ld1 {v7.d}[0], [x0] // Q3
+ ld1 {v7.d}[1], [x1] // Q3
+
+ dup v22.16b, w3 // flim_E
+ dup v23.16b, w4 // flim_I
+
+ vp8_loop_filter inner=\inner, hev_thresh=w5
+
+ // back up to P2: u,v -= stride * 6
+ sub x0, x0, x2, lsl #2
+ sub x1, x1, x2, lsl #2
+ sub x0, x0, x2, lsl #1
+ sub x1, x1, x2, lsl #1
+
+ // Store pixels:
+
+ st1 {v1.d}[0], [x0], x2 // P2
+ st1 {v1.d}[1], [x1], x2 // P2
+ st1 {v2.d}[0], [x0], x2 // P1
+ st1 {v2.d}[1], [x1], x2 // P1
+ st1 {v3.d}[0], [x0], x2 // P0
+ st1 {v3.d}[1], [x1], x2 // P0
+ st1 {v4.d}[0], [x0], x2 // Q0
+ st1 {v4.d}[1], [x1], x2 // Q0
+ st1 {v5.d}[0], [x0], x2 // Q1
+ st1 {v5.d}[1], [x1], x2 // Q1
+ st1 {v6.d}[0], [x0] // Q2
+ st1 {v6.d}[1], [x1] // Q2
+
+ ret
+endfunc
+.endm
+
+vp8_v_loop_filter8uv
+vp8_v_loop_filter8uv _inner, inner=1
+
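+// The horizontal filters load 16 rows of 8 pixels straddling the edge (dst - 4), transpose
+// them so the filter operates on columns, then transpose back before storing.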
+.macro vp8_h_loop_filter16 name, inner=0, simple=0
+function ff_vp8_h_loop_filter16\name\()_neon, export=1
+
+ sub x0, x0, #4
+ // Load pixels:
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v1.d}[0], [x0], x1
+ ld1 {v2.d}[0], [x0], x1
+ ld1 {v3.d}[0], [x0], x1
+ ld1 {v4.d}[0], [x0], x1
+ ld1 {v5.d}[0], [x0], x1
+ ld1 {v6.d}[0], [x0], x1
+ ld1 {v7.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v3.d}[1], [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v5.d}[1], [x0], x1
+ ld1 {v6.d}[1], [x0], x1
+ ld1 {v7.d}[1], [x0], x1
+
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+
+ dup v22.16b, w2 // flim_E
+ .if !\simple
+ dup v23.16b, w3 // flim_I
+ .endif
+
+ vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
+
+ sub x0, x0, x1, lsl #4 // backup 16 rows
+
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+
+ // Store pixels:
+ st1 {v0.d}[0], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v2.d}[0], [x0], x1
+ st1 {v3.d}[0], [x0], x1
+ st1 {v4.d}[0], [x0], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v6.d}[0], [x0], x1
+ st1 {v7.d}[0], [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ st1 {v3.d}[1], [x0], x1
+ st1 {v4.d}[1], [x0], x1
+ st1 {v5.d}[1], [x0], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v7.d}[1], [x0]
+
+ ret
+endfunc
+.endm
+
+vp8_h_loop_filter16
+vp8_h_loop_filter16 _inner, inner=1
+vp8_h_loop_filter16 _simple, simple=1
+
+.macro vp8_h_loop_filter8uv name, inner=0
+function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
+ sub x0, x0, #4
+ sub x1, x1, #4
+
+ // Load pixels:
+ ld1 {v0.d}[0], [x0], x2 // load u
+ ld1 {v0.d}[1], [x1], x2 // load v
+ ld1 {v1.d}[0], [x0], x2
+ ld1 {v1.d}[1], [x1], x2
+ ld1 {v2.d}[0], [x0], x2
+ ld1 {v2.d}[1], [x1], x2
+ ld1 {v3.d}[0], [x0], x2
+ ld1 {v3.d}[1], [x1], x2
+ ld1 {v4.d}[0], [x0], x2
+ ld1 {v4.d}[1], [x1], x2
+ ld1 {v5.d}[0], [x0], x2
+ ld1 {v5.d}[1], [x1], x2
+ ld1 {v6.d}[0], [x0], x2
+ ld1 {v6.d}[1], [x1], x2
+ ld1 {v7.d}[0], [x0], x2
+ ld1 {v7.d}[1], [x1], x2
+
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+
+ dup v22.16b, w3 // flim_E
+ dup v23.16b, w4 // flim_I
+
+ vp8_loop_filter inner=\inner, hev_thresh=w5
+
+ sub x0, x0, x2, lsl #3 // backup u 8 rows
+ sub x1, x1, x2, lsl #3 // backup v 8 rows
+
+ transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
+
+ // Store pixels:
+        st1             {v0.d}[0], [x0], x2 // store u
+        st1             {v0.d}[1], [x1], x2 // store v
+ st1 {v1.d}[0], [x0], x2
+ st1 {v1.d}[1], [x1], x2
+ st1 {v2.d}[0], [x0], x2
+ st1 {v2.d}[1], [x1], x2
+ st1 {v3.d}[0], [x0], x2
+ st1 {v3.d}[1], [x1], x2
+ st1 {v4.d}[0], [x0], x2
+ st1 {v4.d}[1], [x1], x2
+ st1 {v5.d}[0], [x0], x2
+ st1 {v5.d}[1], [x1], x2
+ st1 {v6.d}[0], [x0], x2
+ st1 {v6.d}[1], [x1], x2
+ st1 {v7.d}[0], [x0]
+ st1 {v7.d}[1], [x1]
+
+ ret
+
+endfunc
+.endm
+
+vp8_h_loop_filter8uv
+vp8_h_loop_filter8uv _inner, inner=1
+
+
+function ff_put_vp8_pixels16_neon, export=1
+1:
+ subs w4, w4, #4
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x2], x3
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function ff_put_vp8_pixels8_neon, export=1
+1:
+ subs w4, w4, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v0.d}[1], [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v1.d}[1], [x2], x3
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+/* 4/6-tap 8th-pel MC */
+
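+// The vp8_epel* macros slide a window over the source with ext, widen to 16 bits and apply
+// the filter taps held in v0 with mul/mla/mls; the per-pixel partial sums are combined with
+// a saturating add before the final rounding narrow (sqrshrun) back to 8 bits.
+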
+.macro vp8_epel8_h6 d, s0, s1
+ ext v22.8b, \s0\().8b, \s1\().8b, #1
+ uxtl v18.8h, \s0\().8b
+ ext v23.8b, \s0\().8b, \s1\().8b, #2
+ uxtl v19.8h, v22.8b
+ ext v24.8b, \s0\().8b, \s1\().8b, #3
+ uxtl v21.8h, v23.8b
+ ext v25.8b, \s0\().8b, \s1\().8b, #4
+ uxtl v22.8h, v24.8b
+ ext v26.8b, \s0\().8b, \s1\().8b, #5
+ uxtl v25.8h, v25.8b
+ mul v21.8h, v21.8h, v0.h[2]
+ uxtl v26.8h, v26.8b
+ mul v22.8h, v22.8h, v0.h[3]
+ mls v21.8h, v19.8h, v0.h[1]
+ mls v22.8h, v25.8h, v0.h[4]
+ mla v21.8h, v18.8h, v0.h[0]
+ mla v22.8h, v26.8h, v0.h[5]
+ sqadd v22.8h, v21.8h, v22.8h
+ sqrshrun \d\().8b, v22.8h, #7
+.endm
+
+.macro vp8_epel16_h6 d0, v0, v1
+ ext v22.16b, \v0\().16b, \v1\().16b, #3
+ ext v23.16b, \v0\().16b, \v1\().16b, #4
+ uxtl v19.8h, v22.8b
+ uxtl2 v22.8h, v22.16b
+ ext v3.16b, \v0\().16b, \v1\().16b, #2
+ uxtl v20.8h, v23.8b
+ uxtl2 v23.8h, v23.16b
+ ext v16.16b, \v0\().16b, \v1\().16b, #1
+ uxtl v18.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ ext v2.16b, \v0\().16b, \v1\().16b, #5
+ uxtl v21.8h, v2.8b
+ uxtl2 v2.8h, v2.16b
+ uxtl v17.8h, v16.8b
+ uxtl2 v16.8h, v16.16b
+ mul v19.8h, v19.8h, v0.h[3]
+ mul v18.8h, v18.8h, v0.h[2]
+ mul v3.8h, v3.8h, v0.h[2]
+ mul v22.8h, v22.8h, v0.h[3]
+ mls v19.8h, v20.8h, v0.h[4]
+ uxtl v20.8h, \v0\().8b
+ uxtl2 v1.8h, \v0\().16b
+ mls v18.8h, v17.8h, v0.h[1]
+ mls v3.8h, v16.8h, v0.h[1]
+ mls v22.8h, v23.8h, v0.h[4]
+ mla v18.8h, v20.8h, v0.h[0]
+ mla v19.8h, v21.8h, v0.h[5]
+ mla v3.8h, v1.8h, v0.h[0]
+ mla v22.8h, v2.8h, v0.h[5]
+ sqadd v19.8h, v18.8h, v19.8h
+ sqadd v22.8h, v3.8h, v22.8h
+ sqrshrun \d0\().8b, v19.8h, #7
+ sqrshrun2 \d0\().16b, v22.8h, #7
+.endm
+
+.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
+ uxtl \s0\().8h, \s0\().8b
+ uxtl \s3\().8h, \s3\().8b
+ uxtl \s6\().8h, \s6\().8b
+ uxtl \s1\().8h, \s1\().8b
+ uxtl \s4\().8h, \s4\().8b
+ uxtl \s2\().8h, \s2\().8b
+ uxtl \s5\().8h, \s5\().8b
+ mul \s0\().8h, \s0\().8h, v0.h[0]
+ mul v31.8h , \s3\().8h, v0.h[3]
+ mul \s3\().8h, \s3\().8h, v0.h[2]
+ mul \s6\().8h, \s6\().8h, v0.h[5]
+
+ mls \s0\().8h, \s1\().8h, v0.h[1]
+ mls v31.8h , \s4\().8h, v0.h[4]
+ mls \s3\().8h, \s2\().8h, v0.h[1]
+ mls \s6\().8h, \s5\().8h, v0.h[4]
+
+ mla \s0\().8h, \s2\().8h, v0.h[2]
+ mla v31.8h , \s5\().8h, v0.h[5]
+ mla \s3\().8h, \s1\().8h, v0.h[0]
+ mla \s6\().8h, \s4\().8h, v0.h[3]
+ sqadd v31.8h , \s0\().8h, v31.8h
+ sqadd \s6\().8h, \s3\().8h, \s6\().8h
+ sqrshrun \d0\().8b, v31.8h, #7
+ sqrshrun \d1\().8b, \s6\().8h, #7
+.endm
+
+.macro vp8_epel8_h4 d, v0, v1
+ ext v22.8b, \v0\().8b, \v1\().8b, #1
+ uxtl v19.8h, \v0\().8b
+ ext v23.8b, \v0\().8b, \v1\().8b, #2
+ uxtl v20.8h, v22.8b
+ ext v25.8b, \v0\().8b, \v1\().8b, #3
+ uxtl v22.8h, v23.8b
+ uxtl v25.8h, v25.8b
+ mul v20.8h, v20.8h, v0.h[2]
+ mul v22.8h, v22.8h, v0.h[3]
+ mls v20.8h, v19.8h, v0.h[1]
+ mls v22.8h, v25.8h, v0.h[4]
+ sqadd v22.8h, v20.8h, v22.8h
+ sqrshrun \d\().8b, v22.8h, #7
+.endm
+
+.macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
+ uxtl \s0\().8h, \s0\().8b
+ uxtl \s1\().8h, \s1\().8b
+ uxtl \s2\().8h, \s2\().8b
+ uxtl \s3\().8h, \s3\().8b
+ uxtl \s4\().8h, \s4\().8b
+ mul v21.8h, \s1\().8h, v0.h[2]
+ mul v23.8h, \s2\().8h, v0.h[3]
+ mul \s2\().8h, \s2\().8h, v0.h[2]
+ mul v22.8h, \s3\().8h, v0.h[3]
+ mls v21.8h, \s0\().8h, v0.h[1]
+ mls v23.8h, \s3\().8h, v0.h[4]
+ mls \s2\().8h, \s1\().8h, v0.h[1]
+ mls v22.8h, \s4\().8h, v0.h[4]
+ sqadd v21.8h, v21.8h, v23.8h
+ sqadd \s2\().8h, \s2\().8h, v22.8h
+ sqrshrun \d0\().8b, v21.8h, #7
+ sqrshrun2 \d0\().16b, \s2\().8h, #7
+.endm
+
+
+// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
+// arithmetic can be used to apply filters
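+// (each row of taps sums to 128; the two partial sums per pixel are combined with a
+// saturating sqadd before narrowing)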
+const subpel_filters, align=4
+ .short 0, 6, 123, 12, 1, 0, 0, 0
+ .short 2, 11, 108, 36, 8, 1, 0, 0
+ .short 0, 9, 93, 50, 6, 0, 0, 0
+ .short 3, 16, 77, 77, 16, 3, 0, 0
+ .short 0, 6, 50, 93, 9, 0, 0, 0
+ .short 1, 8, 36, 108, 11, 2, 0, 0
+ .short 0, 1, 12, 123, 6, 0, 0, 0
+endconst
+
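+// Subpel MC: x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w5 = mx, w6 = my.
+// The filter row lives at subpel_filters + (index - 1) * 16, hence the -16 offset on movrel.
+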
+function ff_put_vp8_epel16_v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+
+ sxtw x4, w4
+ sxtw x6, w6
+ movrel x17, subpel_filters, -16
+ add x6, x17, x6, lsl #4 // y
+ ld1 {v0.8h}, [x6]
+1:
+ ld1 {v1.1d - v2.1d}, [x2], x3
+ ld1 {v3.1d - v4.1d}, [x2], x3
+ ld1 {v16.1d - v17.1d}, [x2], x3
+ ld1 {v18.1d - v19.1d}, [x2], x3
+ ld1 {v20.1d - v21.1d}, [x2], x3
+ ld1 {v22.1d - v23.1d}, [x2], x3
+ ld1 {v24.1d - v25.1d}, [x2]
+ sub x2, x2, x3, lsl #2
+
+ vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
+ vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
+
+ st1 {v1.1d - v2.1d}, [x0], x1
+ st1 {v3.1d - v4.1d}, [x0], x1
+ subs x4, x4, #2
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel16_h6_neon, export=1
+ sub x2, x2, #2
+ sxtw x5, w5 // x
+
+ // first pass (horizontal):
+ movrel x17, subpel_filters, -16
+ add x5, x17, x5, lsl #4 // x
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v1.16b, v2.16b}, [x2], x3
+ vp8_epel16_h6 v1, v1, v2
+ st1 {v1.16b}, [x0], x1
+
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+
+function ff_put_vp8_epel16_h6v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #2
+
+ // first pass (horizontal):
+ movrel x17, subpel_filters, -16
+ sxtw x5, w5 // x
+ add x16, x17, x5, lsl #4 // x
+ sub sp, sp, #336+16
+ ld1 {v0.8h}, [x16]
+ add x7, sp, #15
+ sxtw x4, w4
+ add x16, x4, #5 // h
+ bic x7, x7, #15
+1:
+ ld1 {v1.16b, v2.16b}, [x2], x3
+ vp8_epel16_h6 v1, v1, v2
+ st1 {v1.16b}, [x7], #16
+ subs x16, x16, #1
+ b.ne 1b
+
+
+ // second pass (vertical):
+ sxtw x6, w6
+ add x6, x17, x6, lsl #4 // y
+ add x7, sp, #15
+ ld1 {v0.8h}, [x6]
+ bic x7, x7, #15
+2:
+ ld1 {v1.8b - v4.8b}, [x7], #32
+ ld1 {v16.8b - v19.8b}, [x7], #32
+ ld1 {v20.8b - v23.8b}, [x7], #32
+ ld1 {v24.8b - v25.8b}, [x7]
+ sub x7, x7, #64
+
+ vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
+ vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
+ trn1 v1.2d, v1.2d, v2.2d
+ trn1 v3.2d, v3.2d, v4.2d
+
+ st1 {v1.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ subs x4, x4, #2
+ b.ne 2b
+
+ add sp, sp, #336+16
+ ret
+endfunc
+
+function ff_put_vp8_epel8_v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v4.8b}, [x2], x3
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v6.8b}, [x2], x3
+ ld1 {v7.8b}, [x2], x3
+ ld1 {v28.8b}, [x2]
+
+ sub x2, x2, x3, lsl #2
+
+ vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ subs w4, w4, #2
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h6_neon, export=1
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b, v3.8b}, [x2], x3
+
+ vp8_epel8_h6 v2, v2, v3
+
+ st1 {v2.8b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h6v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #2
+ sxtw x4, w4
+
+ // first pass (horizontal):
+ movrel x17, subpel_filters, -16
+ sxtw x5, w5
+ add x5, x17, x5, lsl #4 // x
+ sub sp, sp, #168+16
+ ld1 {v0.8h}, [x5]
+ add x7, sp, #15
+ add x16, x4, #5 // h
+ bic x7, x7, #15
+1:
+ ld1 {v1.8b, v2.8b}, [x2], x3
+
+ vp8_epel8_h6 v1, v1, v2
+
+ st1 {v1.8b}, [x7], #8
+ subs x16, x16, #1
+ b.ne 1b
+
+ // second pass (vertical):
+ sxtw x6, w6
+ add x6, x17, x6, lsl #4 // y
+ add x7, sp, #15
+ ld1 {v0.8h}, [x6]
+ bic x7, x7, #15
+2:
+ ld1 {v1.8b - v4.8b}, [x7], #32
+ ld1 {v5.8b - v7.8b}, [x7]
+
+ sub x7, x7, #16
+
+ vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
+
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ subs x4, x4, #2
+ b.ne 2b
+
+ add sp, sp, #168+16
+ ret
+endfunc
+
+function ff_put_vp8_epel8_v4_neon, export=1
+ sub x2, x2, x3
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v4.8b}, [x2], x3
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v6.8b}, [x2]
+ sub x2, x2, x3, lsl #1
+
+ vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ subs w4, w4, #2
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h4_neon, export=1
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+
+ vp8_epel8_h4 v2, v2, v3
+
+ st1 {v2.8b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h4v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #1
+ sxtw x4, w4
+
+ // first pass (horizontal):
+ movrel x17, subpel_filters, -16
+ sxtw x5, w5
+ add x5, x17, x5, lsl #4 // x
+ sub sp, sp, #168+16
+ ld1 {v0.8h}, [x5]
+ add x7, sp, #15
+ add x16, x4, #5 // h
+ bic x7, x7, #15
+1:
+ ld1 {v1.8b, v2.8b}, [x2], x3
+
+ vp8_epel8_h4 v1, v1, v2
+
+ st1 {v1.8b}, [x7], #8
+ subs x16, x16, #1
+ b.ne 1b
+
+ // second pass (vertical):
+ sxtw x6, w6
+ add x6, x17, x6, lsl #4 // y
+ add x7, sp, #15
+ ld1 {v0.8h}, [x6]
+ bic x7, x7, #15
+2:
+ ld1 {v1.8b - v4.8b}, [x7], #32
+ ld1 {v5.8b - v7.8b}, [x7]
+
+ sub x7, x7, #16
+
+ vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
+
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ subs x4, x4, #2
+ b.ne 2b
+
+ add sp, sp, #168+16
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h4v4_neon, export=1
+ sub x2, x2, x3
+ sub x2, x2, #1
+ sxtw x4, w4
+
+
+ // first pass (horizontal):
+ movrel x17, subpel_filters, -16
+ sxtw x5, w5
+ add x5, x17, x5, lsl #4 // x
+ sub sp, sp, #168+16
+ ld1 {v0.8h}, [x5]
+ add x7, sp, #15
+ add x16, x4, #3 // h
+ bic x7, x7, #15
+1:
+ ld1 {v1.8b, v2.8b}, [x2], x3
+
+ vp8_epel8_h4 v1, v1, v2
+
+ st1 {v1.8b}, [x7], #8
+ subs x16, x16, #1
+ b.ne 1b
+
+ // second pass (vertical):
+ sxtw x6, w6
+ add x6, x17, x6, lsl #4 // y
+ add x7, sp, #15
+ ld1 {v0.8h}, [x6]
+ bic x7, x7, #15
+2:
+ ld1 {v1.8b - v2.8b}, [x7], #16
+ ld1 {v3.8b - v5.8b}, [x7]
+
+ vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
+
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ subs x4, x4, #2
+ b.ne 2b
+
+ add sp, sp, #168+16
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h6v4_neon, export=1
+ sub x2, x2, x3
+ sub x2, x2, #2
+ sxtw x4, w4
+
+
+ // first pass (horizontal):
+ movrel x17, subpel_filters, -16
+ sxtw x5, w5
+ add x5, x17, x5, lsl #4 // x
+ sub sp, sp, #168+16
+ ld1 {v0.8h}, [x5]
+ add x7, sp, #15
+ add x16, x4, #3 // h
+ bic x7, x7, #15
+1:
+ ld1 {v1.8b, v2.8b}, [x2], x3
+
+ vp8_epel8_h6 v1, v1, v2
+
+ st1 {v1.8b}, [x7], #8
+ subs x16, x16, #1
+ b.ne 1b
+
+ // second pass (vertical):
+ sxtw x6, w6
+ add x6, x17, x6, lsl #4 // y
+ add x7, sp, #15
+ ld1 {v0.8h}, [x6]
+ bic x7, x7, #15
+2:
+ ld1 {v1.8b - v2.8b}, [x7], #16
+ ld1 {v3.8b - v5.8b}, [x7]
+
+ vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
+
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ subs x4, x4, #2
+ b.ne 2b
+
+ add sp, sp, #168+16
+ ret
+endfunc
+
+function ff_put_vp8_epel4_v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1r {v2.2s}, [x2], x3
+ ld1r {v3.2s}, [x2], x3
+ ld1r {v4.2s}, [x2], x3
+ ld1r {v5.2s}, [x2], x3
+ ld1r {v6.2s}, [x2], x3
+ ld1r {v7.2s}, [x2], x3
+ ld1r {v28.2s}, [x2]
+ sub x2, x2, x3, lsl #2
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x2], x3
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x2], x3
+ ld1 {v7.s}[1], [x2], x3
+ ld1 {v28.s}[1], [x2]
+ sub x2, x2, x3, lsl #2
+
+ vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v3.s}[1], [x0], x1
+ subs w4, w4, #4
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h6_neon, export=1
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ vp8_epel8_h6 v2, v2, v3
+ st1 {v2.s}[0], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h6v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #52
+ add w8, w4, #5
+ mov x9, sp
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ vp8_epel8_h6 v2, v2, v3
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1 {v6.8b}, [x9], #8
+ ld1r {v28.2s}, [x9]
+ sub x9, x9, #16
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v7.8b}, [x9], #8
+ ld1 {v28.s}[1], [x9]
+ sub x9, x9, #16
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ trn1 v3.2s, v6.2s, v7.2s
+ trn2 v7.2s, v6.2s, v7.2s
+ vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v3.s}[1], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #52
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h4v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #52
+ add w8, w4, #5
+ mov x9, sp
+1:
+ ld1 {v2.8b}, [x2], x3
+ vp8_epel8_h4 v2, v2, v2
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1 {v6.8b}, [x9], #8
+ ld1r {v28.2s}, [x9]
+ sub x9, x9, #16
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v7.8b}, [x9], #8
+ ld1 {v28.s}[1], [x9]
+ sub x9, x9, #16
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ trn1 v3.2s, v6.2s, v7.2s
+ trn2 v7.2s, v6.2s, v7.2s
+ vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v3.s}[1], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #52
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h6v4_neon, export=1
+ sub x2, x2, x3
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #44
+ add w8, w4, #3
+ mov x9, sp
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ vp8_epel8_h6 v2, v2, v3
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1r {v6.2s}, [x9]
+ sub x9, x9, #8
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v6.s}[1], [x9]
+ sub x9, x9, #8
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ st1 {v1.s}[3], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #44
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h4_neon, export=1
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b}, [x2], x3
+ vp8_epel8_h4 v2, v2, v2
+ st1 {v2.s}[0], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_v4_neon, export=1
+ sub x2, x2, x3
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1r {v2.2s}, [x2], x3
+ ld1r {v3.2s}, [x2], x3
+ ld1r {v4.2s}, [x2], x3
+ ld1r {v5.2s}, [x2], x3
+ ld1r {v6.2s}, [x2]
+ sub x2, x2, x3, lsl #1
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x2], x3
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x2]
+ sub x2, x2, x3, lsl #1
+
+ vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+ st1 {v2.s}[0], [x0], x1
+ st1 {v2.s}[2], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v2.s}[3], [x0], x1
+ subs w4, w4, #4
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h4v4_neon, export=1
+ sub x2, x2, x3
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #44
+ add w8, w4, #3
+ mov x9, sp
+1:
+ ld1 {v2.8b}, [x2], x3
+ vp8_epel8_h4 v2, v2, v3
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1r {v6.2s}, [x9]
+ sub x9, x9, #8
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v6.s}[1], [x9]
+ sub x9, x9, #8
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ st1 {v1.s}[3], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #44
+ ret
+endfunc
+
+/* Bilinear MC */
+
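+// Bilinear MC: dst = (src[0] * (8 - frac) + src[1] * frac + 4) >> 3, with frac = mx (w5)
+// horizontally and my (w6) vertically.
+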
+function ff_put_vp8_bilin16_h_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5
+ sub w5, w7, w5
+ dup v1.8b, w5
+1:
+ subs w4, w4, #2
+ ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3
+ ext v5.8b, v3.8b, v4.8b, #1
+ ext v4.8b, v2.8b, v3.8b, #1
+ umull v16.8h, v2.8b, v1.8b
+ umlal v16.8h, v4.8b, v0.8b
+ ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
+ umull v6.8h, v3.8b, v1.8b
+ umlal v6.8h, v5.8b, v0.8b
+ ext v21.8b, v19.8b, v20.8b, #1
+ ext v20.8b, v18.8b, v19.8b, #1
+ umull v22.8h, v18.8b, v1.8b
+ umlal v22.8h, v20.8b, v0.8b
+ umull v24.8h, v19.8b, v1.8b
+ umlal v24.8h, v21.8b, v0.8b
+ rshrn v4.8b, v16.8h, #3
+ rshrn2 v4.16b, v6.8h, #3
+ rshrn v6.8b, v22.8h, #3
+ rshrn2 v6.16b, v24.8h, #3
+ st1 {v4.16b}, [x0], x1
+ st1 {v6.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin16_v_neon, export=1
+ mov w7, #8
+ dup v0.16b, w6
+ sub w6, w7, w6
+ dup v1.16b, w6
+
+ ld1 {v2.16b}, [x2], x3
+1:
+ subs w4, w4, #2
+ ld1 {v4.16b}, [x2], x3
+ umull v6.8h, v2.8b, v1.8b
+ umlal v6.8h, v4.8b, v0.8b
+ umull2 v16.8h, v2.16b, v1.16b
+ umlal2 v16.8h, v4.16b, v0.16b
+ ld1 {v2.16b}, [x2], x3
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v2.8b, v0.8b
+ umull2 v20.8h, v4.16b, v1.16b
+ umlal2 v20.8h, v2.16b, v0.16b
+ rshrn v4.8b, v6.8h, #3
+ rshrn2 v4.16b, v16.8h, #3
+ rshrn v6.8b, v18.8h, #3
+ rshrn2 v6.16b, v20.8h, #3
+ st1 {v4.16b}, [x0], x1
+ st1 {v6.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin16_hv_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5 // mx
+ sub w5, w7, w5
+ dup v1.8b, w5
+ dup v2.16b, w6 // my
+ sub w6, w7, w6
+ dup v3.16b, w6
+
+ ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3
+
+ ext v7.8b, v5.8b, v6.8b, #1
+ ext v6.8b, v4.8b, v5.8b, #1
+ umull v16.8h, v4.8b, v1.8b
+ umlal v16.8h, v6.8b, v0.8b
+ umull v18.8h, v5.8b, v1.8b
+ umlal v18.8h, v7.8b, v0.8b
+ rshrn v4.8b, v16.8h, #3
+ rshrn2 v4.16b, v18.8h, #3
+1:
+ subs w4, w4, #2
+ ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
+ ext v21.8b, v19.8b, v20.8b, #1
+ ext v20.8b, v18.8b, v19.8b, #1
+ umull v22.8h, v18.8b, v1.8b
+ umlal v22.8h, v20.8b, v0.8b
+ ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3
+ umull v24.8h, v19.8b, v1.8b
+ umlal v24.8h, v21.8b, v0.8b
+ ext v29.8b, v27.8b, v28.8b, #1
+ ext v28.8b, v26.8b, v27.8b, #1
+ umull v16.8h, v26.8b, v1.8b
+ umlal v16.8h, v28.8b, v0.8b
+ umull v18.8h, v27.8b, v1.8b
+ umlal v18.8h, v29.8b, v0.8b
+ rshrn v6.8b, v22.8h, #3
+ rshrn2 v6.16b, v24.8h, #3
+ umull v24.8h, v4.8b, v3.8b
+ umlal v24.8h, v6.8b, v2.8b
+ umull2 v30.8h, v4.16b, v3.16b
+ umlal2 v30.8h, v6.16b, v2.16b
+ rshrn v4.8b, v16.8h, #3
+ rshrn2 v4.16b, v18.8h, #3
+ umull v20.8h, v6.8b, v3.8b
+ umlal v20.8h, v4.8b, v2.8b
+ umull2 v22.8h, v6.16b, v3.16b
+ umlal2 v22.8h, v4.16b, v2.16b
+ rshrn v24.8b, v24.8h, #3
+ rshrn2 v24.16b, v30.8h, #3
+ st1 {v24.16b}, [x0], x1
+ rshrn v20.8b, v20.8h, #3
+ rshrn2 v20.16b, v22.8h, #3
+ st1 {v20.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin8_h_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5
+ sub w5, w7, w5
+ dup v1.8b, w5
+1:
+ subs w4, w4, #2
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ ext v3.8b, v2.8b, v3.8b, #1
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ ld1 {v6.8b,v7.8b}, [x2], x3
+ ext v7.8b, v6.8b, v7.8b, #1
+ umull v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v7.8b, v0.8b
+ rshrn v4.8b, v4.8h, #3
+ rshrn v16.8b, v16.8h, #3
+ st1 {v4.8b}, [x0], x1
+ st1 {v16.8b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin8_v_neon, export=1
+ mov w7, #8
+ dup v0.8b, w6
+ sub w6, w7, w6
+ dup v1.8b, w6
+
+ ld1 {v2.8b}, [x2], x3
+1:
+ subs w4, w4, #2
+ ld1 {v3.8b}, [x2], x3
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ ld1 {v2.8b}, [x2], x3
+ umull v6.8h, v3.8b, v1.8b
+ umlal v6.8h, v2.8b, v0.8b
+ rshrn v4.8b, v4.8h, #3
+ rshrn v6.8b, v6.8h, #3
+ st1 {v4.8b}, [x0], x1
+ st1 {v6.8b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin8_hv_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5 // mx
+ sub w5, w7, w5
+ dup v1.8b, w5
+ dup v2.8b, w6 // my
+ sub w6, w7, w6
+ dup v3.8b, w6
+
+ ld1 {v4.8b,v5.8b}, [x2], x3
+ ext v5.8b, v4.8b, v5.8b, #1
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v5.8b, v0.8b
+ rshrn v22.8b, v18.8h, #3
+1:
+ subs w4, w4, #2
+ ld1 {v6.8b,v7.8b}, [x2], x3
+ ext v7.8b, v6.8b, v7.8b, #1
+ umull v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v7.8b, v0.8b
+ ld1 {v4.8b,v5.8b}, [x2], x3
+ ext v5.8b, v4.8b, v5.8b, #1
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v5.8b, v0.8b
+ rshrn v16.8b, v16.8h, #3
+ umull v20.8h, v22.8b, v3.8b
+ umlal v20.8h, v16.8b, v2.8b
+ rshrn v22.8b, v18.8h, #3
+ umull v24.8h, v16.8b, v3.8b
+ umlal v24.8h, v22.8b, v2.8b
+ rshrn v20.8b, v20.8h, #3
+ st1 {v20.8b}, [x0], x1
+ rshrn v23.8b, v24.8h, #3
+ st1 {v23.8b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin4_h_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5
+ sub w5, w7, w5
+ dup v1.8b, w5
+1:
+ subs w4, w4, #2
+ ld1 {v2.8b}, [x2], x3
+ ext v3.8b, v2.8b, v3.8b, #1
+ ld1 {v6.8b}, [x2], x3
+ ext v7.8b, v6.8b, v7.8b, #1
+ trn1 v2.2s, v2.2s, v6.2s
+ trn1 v3.2s, v3.2s, v7.2s
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ rshrn v4.8b, v4.8h, #3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin4_v_neon, export=1
+ mov w7, #8
+ dup v0.8b, w6
+ sub w6, w7, w6
+ dup v1.8b, w6
+
+ ld1r {v2.2s}, [x2], x3
+1:
+ ld1r {v3.2s}, [x2]
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v3.s}[1], [x2], x3
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ trn2 v2.2s, v3.2s, v2.2s
+ rshrn v4.8b, v4.8h, #3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ subs w4, w4, #2
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin4_hv_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5 // mx
+ sub w5, w7, w5
+ dup v1.8b, w5
+ dup v2.8b, w6 // my
+ sub w6, w7, w6
+ dup v3.8b, w6
+
+ ld1 {v4.8b}, [x2], x3
+ ext v5.8b, v4.8b, v4.8b, #1
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v5.8b, v0.8b
+ rshrn v22.8b, v18.8h, #3
+1:
+ subs w4, w4, #2
+ ld1 {v6.8b}, [x2], x3
+ ext v7.8b, v6.8b, v6.8b, #1
+ ld1 {v4.8b}, [x2], x3
+ ext v5.8b, v4.8b, v4.8b, #1
+ trn1 v6.2s, v6.2s, v4.2s
+ trn1 v7.2s, v7.2s, v5.2s
+ umull v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v7.8b, v0.8b
+ rshrn v16.8b, v16.8h, #3
+ umull v20.8h, v16.8b, v2.8b
+ trn1 v22.2s, v22.2s, v16.2s
+ umlal v20.8h, v22.8b, v3.8b
+ rev64 v22.2s, v16.2s
+ rshrn v20.8b, v20.8h, #3
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
new file mode 100644
index 0000000000..9df1752c62
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
+#define AVCODEC_AARCH64_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
new file mode 100644
index 0000000000..0fa0d7f8c2
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
new file mode 100644
index 0000000000..dae2232403
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
new file mode 100644
index 0000000000..8dcfdeaaf7
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix) \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp) \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp) \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
+ temp + 3 * 2 * sz, 2 * sz, \
+ h, 0, my); \
+}
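To spell out the sizing above (a reading of the macro, not an authoritative statement): the scratch buffer holds 16-bit intermediate pixels, so each of its rows is 2 * sz bytes. The row count (1 + (sz < 64)) * sz + 8 appears to cover the tallest block that can be paired with a width of sz (up to 2 * sz rows for widths below 64, 64 rows for the 64-wide case) plus the 8 rows added by the h + 8 horizontal pass; only h + 7 rows are strictly needed, the extra one keeps the row count even. For sz = 32 this comes to (2 * 32 + 8) * 32 * 2 = 4608 bytes, and the vertical pass then starts 3 * 2 * sz bytes into the buffer so the 8-tap filter is centred on the output rows.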
+
+#define decl_filter_funcs(op, dir, sz, bpp) \
+ decl_mc_func(op, regular, dir, sz, bpp); \
+ decl_mc_func(op, sharp, dir, sz, bpp); \
+ decl_mc_func(op, smooth, dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp) \
+ decl_filter_funcs(put, h, sz, bpp); \
+ decl_filter_funcs(avg, h, sz, bpp); \
+ decl_filter_funcs(put, v, sz, bpp); \
+ decl_filter_funcs(avg, v, sz, bpp); \
+ decl_filter_funcs(put, hv, sz, bpp); \
+ decl_filter_funcs(avg, hv, sz, bpp)
+
+#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
+#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
+#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64, );
+declare_fpel(copy, 32, );
+declare_fpel(copy, 16, );
+declare_fpel(copy, 8, );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8, _16);
+declare_fpel(avg, 4, _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp) \
+ define_8tap_2d_fn(put, regular, sz, bpp) \
+ define_8tap_2d_fn(put, sharp, sz, bpp) \
+ define_8tap_2d_fn(put, smooth, sz, bpp) \
+ define_8tap_2d_fn(avg, regular, sz, bpp) \
+ define_8tap_2d_fn(avg, sharp, sz, bpp) \
+ define_8tap_2d_fn(avg, smooth, sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8, BPP)
+define_8tap_2d_funcs(4, BPP)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#define init_fpel(idx1, idx2, sz, type, suffix) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
+
+#define init_copy(idx, sz, suffix) \
+ init_fpel(idx, 0, sz, copy, suffix)
+
+#define init_avg(idx, sz, suffix) \
+ init_fpel(idx, 1, sz, avg, suffix)
+
+#define init_copy_avg(idx, sz1, sz2) \
+ init_copy(idx, sz2, _neon); \
+ init_avg (idx, sz1, _16_neon)
+
+ if (have_armv8(cpu_flags)) {
+ init_copy(0, 128, _aarch64);
+ init_copy(1, 64, _aarch64);
+ init_copy(2, 32, _aarch64);
+ }
+
+ if (have_neon(cpu_flags)) {
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp) \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
+
+
+ init_avg(0, 64, _16_neon);
+ init_avg(1, 32, _16_neon);
+ init_avg(2, 16, _16_neon);
+ init_copy_avg(3, 8, 16);
+ init_copy_avg(4, 4, 8);
+
+ init_mc_funcs_dirs(0, 64, BPP);
+ init_mc_funcs_dirs(1, 32, BPP);
+ init_mc_funcs_dirs(2, 16, BPP);
+ init_mc_funcs_dirs(3, 8, BPP);
+ init_mc_funcs_dirs(4, 4, BPP);
+ }
+}
+
+#define define_itxfm2(type_a, type_b, sz, bpp) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
+
+#define define_itxfm_funcs(sz, bpp) \
+ define_itxfm(idct, idct, sz, bpp); \
+ define_itxfm(iadst, idct, sz, bpp); \
+ define_itxfm(idct, iadst, sz, bpp); \
+ define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4, BPP);
+define_itxfm_funcs(8, BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4, BPP);
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
+
+ init_itxfm(TX_4X4, 4x4, BPP);
+ init_itxfm(TX_8X8, 8x8, BPP);
+ init_itxfm(TX_16X16, 16x16, BPP);
+ init_idct(TX_32X32, idct_idct_32x32, BPP);
+ init_idct(4, iwht_iwht_4x4, BPP);
+ }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+ define_loop_filter(h, wd, size, bpp); \
+ define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4, 8, BPP);
+define_loop_filters(8, 8, BPP);
+define_loop_filters(16, 8, BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+ dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+ init_lpf_func_8(idx, 0, h, wd, bpp); \
+ init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp) \
+ init_lpf_func_16(0, h, bpp); \
+ init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+ init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
+ init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp) \
+ init_lpf_funcs_8_wd(0, 4, bpp); \
+ init_lpf_funcs_8_wd(1, 8, bpp); \
+ init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp) \
+ init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+ init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+ init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+ init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+ init_lpf_funcs_8(BPP);
+ init_lpf_funcs_16(BPP);
+ init_lpf_funcs_mix2(BPP);
+ }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+ vp9dsp_mc_init_aarch64(dsp);
+ vp9dsp_loopfilter_init_aarch64(dsp);
+ vp9dsp_itxfm_init_aarch64(dsp);
+}
diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
new file mode 100644
index 0000000000..4c699759fe
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz) \
+void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define declare_copy_avg(sz) \
+ declare_fpel(copy, sz); \
+ declare_fpel(avg , sz)
+
+#define decl_mc_func(op, filter, dir, sz) \
+void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz) \
+static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
+ temp + 3 * sz, sz, \
+ h, 0, my); \
+}
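The same two-pass horizontal-then-vertical composition is used here with one byte per intermediate pixel, so the scratch buffer is half the size of the 16 bpp variant; for sz = 64 that is (64 + 8) * 64 = 4608 bytes, and the vertical pass starts 3 * sz bytes in.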
+
+#define decl_filter_funcs(op, dir, sz) \
+ decl_mc_func(op, regular, dir, sz); \
+ decl_mc_func(op, sharp, dir, sz); \
+ decl_mc_func(op, smooth, dir, sz)
+
+#define decl_mc_funcs(sz) \
+ decl_filter_funcs(put, h, sz); \
+ decl_filter_funcs(avg, h, sz); \
+ decl_filter_funcs(put, v, sz); \
+ decl_filter_funcs(avg, v, sz); \
+ decl_filter_funcs(put, hv, sz); \
+ decl_filter_funcs(avg, hv, sz)
+
+#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
+#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
+
+declare_copy_avg(64);
+declare_copy_avg(32);
+declare_copy_avg(16);
+declare_copy_avg(8);
+declare_copy_avg(4);
+
+decl_mc_funcs(64);
+decl_mc_funcs(32);
+decl_mc_funcs(16);
+decl_mc_funcs(8);
+decl_mc_funcs(4);
+
+#define define_8tap_2d_funcs(sz) \
+ define_8tap_2d_fn(put, regular, sz) \
+ define_8tap_2d_fn(put, sharp, sz) \
+ define_8tap_2d_fn(put, smooth, sz) \
+ define_8tap_2d_fn(avg, regular, sz) \
+ define_8tap_2d_fn(avg, sharp, sz) \
+ define_8tap_2d_fn(avg, smooth, sz)
+
+define_8tap_2d_funcs(64)
+define_8tap_2d_funcs(32)
+define_8tap_2d_funcs(16)
+define_8tap_2d_funcs(8)
+define_8tap_2d_funcs(4)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#define init_fpel(idx1, idx2, sz, type, suffix) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
+
+#define init_copy(idx, sz, suffix) \
+ init_fpel(idx, 0, sz, copy, suffix)
+
+#define init_avg(idx, sz, suffix) \
+ init_fpel(idx, 1, sz, avg, suffix)
+
+#define init_copy_avg(idx, sz) \
+ init_copy(idx, sz, _neon); \
+ init_avg (idx, sz, _neon)
+
+ if (have_armv8(cpu_flags)) {
+ init_copy(0, 64, _aarch64);
+ init_copy(1, 32, _aarch64);
+ }
+
+ if (have_neon(cpu_flags)) {
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
+
+#define init_mc_funcs_dirs(idx, sz) \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
+ init_mc_funcs(idx, hv, 1, 1, sz,)
+
+ init_avg(0, 64, _neon);
+ init_avg(1, 32, _neon);
+ init_copy_avg(2, 16);
+ init_copy_avg(3, 8);
+ init_copy_avg(4, 4);
+
+ init_mc_funcs_dirs(0, 64);
+ init_mc_funcs_dirs(1, 32);
+ init_mc_funcs_dirs(2, 16);
+ init_mc_funcs_dirs(3, 8);
+ init_mc_funcs_dirs(4, 4);
+ }
+}
+
+#define define_itxfm(type_a, type_b, sz) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+
+#define define_itxfm_funcs(sz) \
+ define_itxfm(idct, idct, sz); \
+ define_itxfm(iadst, idct, sz); \
+ define_itxfm(idct, iadst, sz); \
+ define_itxfm(iadst, iadst, sz)
+
+define_itxfm_funcs(4);
+define_itxfm_funcs(8);
+define_itxfm_funcs(16);
+define_itxfm(idct, idct, 32);
+define_itxfm(iwht, iwht, 4);
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm(tx, sz) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
+
+#define init_idct(tx, nm) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
+
+ init_itxfm(TX_4X4, 4x4);
+ init_itxfm(TX_8X8, 8x8);
+ init_itxfm(TX_16X16, 16x16);
+ init_idct(TX_32X32, idct_idct_32x32);
+ init_idct(4, iwht_iwht_4x4);
+ }
+}
+
+#define define_loop_filter(dir, wd, len) \
+void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, len) \
+ define_loop_filter(h, wd, len); \
+ define_loop_filter(v, wd, len)
+
+define_loop_filters(4, 8);
+define_loop_filters(8, 8);
+define_loop_filters(16, 8);
+
+define_loop_filters(16, 16);
+
+define_loop_filters(44, 16);
+define_loop_filters(48, 16);
+define_loop_filters(84, 16);
+define_loop_filters(88, 16);
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
+ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
+ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
+ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
+ dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
+ dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
+
+ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
+ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
+
+ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
+ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
+ dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
+ dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
+ dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
+ dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
+ dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
+ dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
+ }
+}
+
+av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
+{
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_aarch64(dsp);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_aarch64(dsp);
+ return;
+ } else if (bpp != 8)
+ return;
+
+ vp9dsp_mc_init_aarch64(dsp);
+ vp9dsp_loopfilter_init_aarch64(dsp);
+ vp9dsp_itxfm_init_aarch64(dsp);
+}
diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000000..68296d9c40
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,2017 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
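For orientation (a reading of the tables above, not part of the patch): the idct constants are Q14 fixed-point trigonometric values, i.e. round(2^14 * cos θ) / round(2^14 * sin θ) pairs for the angles used in the DCT stages, e.g. 11585 ≈ 2^14 * cos(π/4), 15137 ≈ 2^14 * cos(π/8), 6270 ≈ 2^14 * sin(π/8), 16069 ≈ 2^14 * cos(π/16). The iadst tables hold VP9's sinpi-based constants in the same 14-bit format. The "+ (1 << 13)" rounding and ">> 14" shifts in the butterfly macros below undo this scaling.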
+
+.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
+ trn1 \r4\().4s, \r0\().4s, \r1\().4s
+ trn2 \r5\().4s, \r0\().4s, \r1\().4s
+ trn1 \r6\().4s, \r2\().4s, \r3\().4s
+ trn2 \r7\().4s, \r2\().4s, \r3\().4s
+ trn1 \r0\().2d, \r4\().2d, \r6\().2d
+ trn2 \r2\().2d, \r4\().2d, \r6\().2d
+ trn1 \r1\().2d, \r5\().2d, \r7\().2d
+ trn2 \r3\().2d, \r5\().2d, \r7\().2d
+.endm
+
+// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers.
+.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+ transpose_4x4s \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
+ transpose_4x4s \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+ // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+ // while swapping the two 4x4 matrices between each other
+
+ // First step of the 4x4 transpose of r1-r7, into t0-t3
+ trn1 \t0\().4s, \r1\().4s, \r3\().4s
+ trn2 \t1\().4s, \r1\().4s, \r3\().4s
+ trn1 \t2\().4s, \r5\().4s, \r7\().4s
+ trn2 \t3\().4s, \r5\().4s, \r7\().4s
+
+ // First step of the 4x4 transpose of r8-r14, into r1-r7
+ trn1 \r1\().4s, \r8\().4s, \r10\().4s
+ trn2 \r3\().4s, \r8\().4s, \r10\().4s
+ trn1 \r5\().4s, \r12\().4s, \r14\().4s
+ trn2 \r7\().4s, \r12\().4s, \r14\().4s
+
+ // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
+ trn1 \r8\().2d, \t0\().2d, \t2\().2d
+ trn2 \r12\().2d, \t0\().2d, \t2\().2d
+ trn1 \r10\().2d, \t1\().2d, \t3\().2d
+ trn2 \r14\().2d, \t1\().2d, \t3\().2d
+
+ // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
+ trn1 \t0\().2d, \r1\().2d, \r5\().2d
+ trn2 \r5\().2d, \r1\().2d, \r5\().2d
+ trn1 \t1\().2d, \r3\().2d, \r7\().2d
+ trn2 \r7\().2d, \r3\().2d, \r7\().2d
+
+ // Move the outputs of trn1 back in place
+ mov \r1\().16b, \t0\().16b
+ mov \r3\().16b, \t1\().16b
+.endm
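Viewed as a 2x2 grid of 4x4 blocks, this is the usual block-transpose identity (noted here as a reading aid, not part of the patch):

    ( A B )^T   ( A^T C^T )
    ( C D )   = ( B^T D^T )

The two diagonal blocks (low halves of rows 0-3, high halves of rows 4-7) are transposed in place by the transpose_4x4s calls at the top, while the two off-diagonal blocks are transposed and swapped by the trn1/trn2 sequences that follow.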
+
+// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+// in/out are .4s registers; this can do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+ neg \tmp4\().4s, v0.4s
+.endif
+ add \tmp1\().4s, \in1\().4s, \in2\().4s
+ sub \tmp2\().4s, \in1\().4s, \in2\().4s
+.if \neg > 0
+ smull \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
+ smull2 \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
+.else
+ smull \tmp3\().2d, \tmp1\().2s, v0.s[0]
+ smull2 \tmp4\().2d, \tmp1\().4s, v0.s[0]
+.endif
+.ifb \tmp5
+ rshrn \out1\().2s, \tmp3\().2d, #14
+ rshrn2 \out1\().4s, \tmp4\().2d, #14
+ smull \tmp3\().2d, \tmp2\().2s, v0.s[0]
+ smull2 \tmp4\().2d, \tmp2\().4s, v0.s[0]
+ rshrn \out2\().2s, \tmp3\().2d, #14
+ rshrn2 \out2\().4s, \tmp4\().2d, #14
+.else
+ smull \tmp5\().2d, \tmp2\().2s, v0.s[0]
+ smull2 \tmp6\().2d, \tmp2\().4s, v0.s[0]
+ rshrn \out1\().2s, \tmp3\().2d, #14
+ rshrn2 \out1\().4s, \tmp4\().2d, #14
+ rshrn \out2\().2s, \tmp5\().2d, #14
+ rshrn2 \out2\().4s, \tmp6\().2d, #14
+.endif
+.endm
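A scalar C sketch of the per-lane arithmetic this macro performs (an illustration of the formula in the comment above, not part of the patch; the helper name is made up). The 32x32-bit products go through a 64-bit intermediate, matching the smull/rshrn pairs:

    #include <stdint.h>

    static inline void dmbutterfly0_ref(int32_t *out1, int32_t *out2,
                                        int32_t in1, int32_t in2)
    {
        const int32_t c = 11585;  /* 2^14 * cos(pi/4), v0.s[0] in the NEON code */
        *out1 = (int32_t)(((int64_t)(in1 + in2) * c + (1 << 13)) >> 14);
        *out2 = (int32_t)(((int64_t)(in1 - in2) * c + (1 << 13)) >> 14);
    }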
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+ smull \tmp1\().2d, \in1\().2s, v0.s[0]
+ smull2 \tmp2\().2d, \in1\().4s, v0.s[0]
+ rshrn \out1\().2s, \tmp1\().2d, #14
+ rshrn2 \out1\().4s, \tmp2\().2d, #14
+ rshrn \out2\().2s, \tmp1\().2d, #14
+ rshrn2 \out2\().4s, \tmp2\().2d, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .2d registers, in are 2 x .4s registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+ smull \out1\().2d, \in1\().2s, \coef1
+ smull2 \out2\().2d, \in1\().4s, \coef1
+ smull \out3\().2d, \in1\().2s, \coef2
+ smull2 \out4\().2d, \in1\().4s, \coef2
+ smlsl \out1\().2d, \in2\().2s, \coef2
+ smlsl2 \out2\().2d, \in2\().4s, \coef2
+ smlal \out3\().2d, \in2\().2s, \coef1
+ smlal2 \out4\().2d, \in2\().4s, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .4s registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+ neg \tmp3\().2d, \tmp3\().2d
+ neg \tmp4\().2d, \tmp4\().2d
+.endif
+ rshrn \inout1\().2s, \tmp1\().2d, #14
+ rshrn2 \inout1\().4s, \tmp2\().2d, #14
+ rshrn \inout2\().2s, \tmp3\().2d, #14
+ rshrn2 \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().2d, \inout1\().2s, \coef1
+ smull2 \tmp2\().2d, \inout1\().4s, \coef1
+ smull \tmp3\().2d, \inout1\().2s, \coef2
+ smull2 \tmp4\().2d, \inout1\().4s, \coef2
+ rshrn \inout1\().2s, \tmp1\().2d, #14
+ rshrn2 \inout1\().4s, \tmp2\().2d, #14
+ rshrn \inout2\().2s, \tmp3\().2d, #14
+ rshrn2 \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().2d, \inout2\().2s, \coef2
+ smull2 \tmp2\().2d, \inout2\().4s, \coef2
+ smull \tmp3\().2d, \inout2\().2s, \coef1
+ smull2 \tmp4\().2d, \inout2\().4s, \coef1
+ neg \tmp1\().2d, \tmp1\().2d
+ neg \tmp2\().2d, \tmp2\().2d
+ rshrn \inout2\().2s, \tmp3\().2d, #14
+ rshrn2 \inout2\().4s, \tmp4\().2d, #14
+ rshrn \inout1\().2s, \tmp1\().2d, #14
+ rshrn2 \inout1\().4s, \tmp2\().2d, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+ smull \out1\().2d, \in\().2s, \coef
+ smull2 \out2\().2d, \in\().4s, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+ rshrn \out\().2s, \in1\().2d, \shift
+ rshrn2 \out\().4s, \in2\().2d, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_4s out1, out2, in1, in2
+ add \out1\().4s, \in1\().4s, \in2\().4s
+ sub \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_4s_r out1, out2, in1, in2
+ sub \out1\().4s, \in1\().4s, \in2\().4s
+ add \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .4s registers, in are 4 x .2d registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ add \tmp1\().2d, \in1\().2d, \in3\().2d
+ add \tmp2\().2d, \in2\().2d, \in4\().2d
+ sub \tmp3\().2d, \in1\().2d, \in3\().2d
+ sub \tmp4\().2d, \in2\().2d, \in4\().2d
+ rshrn \out1\().2s, \tmp1\().2d, #14
+ rshrn2 \out1\().4s, \tmp2\().2d, #14
+ rshrn \out2\().2s, \tmp3\().2d, #14
+ rshrn2 \out2\().4s, \tmp4\().2d, #14
+.endm
+
+.macro iwht4_10 c0, c1, c2, c3
+ add \c0\().4s, \c0\().4s, \c1\().4s
+ sub v17.4s, \c2\().4s, \c3\().4s
+ sub v16.4s, \c0\().4s, v17.4s
+ sshr v16.4s, v16.4s, #1
+ sub \c2\().4s, v16.4s, \c1\().4s
+ sub \c1\().4s, v16.4s, \c3\().4s
+ add \c3\().4s, v17.4s, \c2\().4s
+ sub \c0\().4s, \c0\().4s, \c1\().4s
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3
+ iwht4_10 \c0, \c1, \c2, \c3
+.endm
+
+.macro idct4_10 c0, c1, c2, c3
+ mul v22.4s, \c1\().4s, v0.s[3]
+ mul v20.4s, \c1\().4s, v0.s[2]
+ add v16.4s, \c0\().4s, \c2\().4s
+ sub v17.4s, \c0\().4s, \c2\().4s
+ mla v22.4s, \c3\().4s, v0.s[2]
+ mul v18.4s, v16.4s, v0.s[0]
+ mul v24.4s, v17.4s, v0.s[0]
+ mls v20.4s, \c3\().4s, v0.s[3]
+ srshr v22.4s, v22.4s, #14
+ srshr v18.4s, v18.4s, #14
+ srshr v24.4s, v24.4s, #14
+ srshr v20.4s, v20.4s, #14
+ add \c0\().4s, v18.4s, v22.4s
+ sub \c3\().4s, v18.4s, v22.4s
+ add \c1\().4s, v24.4s, v20.4s
+ sub \c2\().4s, v24.4s, v20.4s
+.endm
+
+.macro idct4_12 c0, c1, c2, c3
+ smull v22.2d, \c1\().2s, v0.s[3]
+ smull2 v23.2d, \c1\().4s, v0.s[3]
+ smull v20.2d, \c1\().2s, v0.s[2]
+ smull2 v21.2d, \c1\().4s, v0.s[2]
+ add v16.4s, \c0\().4s, \c2\().4s
+ sub v17.4s, \c0\().4s, \c2\().4s
+ smlal v22.2d, \c3\().2s, v0.s[2]
+ smlal2 v23.2d, \c3\().4s, v0.s[2]
+ smull v18.2d, v16.2s, v0.s[0]
+ smull2 v19.2d, v16.4s, v0.s[0]
+ smull v24.2d, v17.2s, v0.s[0]
+ smull2 v25.2d, v17.4s, v0.s[0]
+ smlsl v20.2d, \c3\().2s, v0.s[3]
+ smlsl2 v21.2d, \c3\().4s, v0.s[3]
+ rshrn v22.2s, v22.2d, #14
+ rshrn2 v22.4s, v23.2d, #14
+ rshrn v18.2s, v18.2d, #14
+ rshrn2 v18.4s, v19.2d, #14
+ rshrn v24.2s, v24.2d, #14
+ rshrn2 v24.4s, v25.2d, #14
+ rshrn v20.2s, v20.2d, #14
+ rshrn2 v20.4s, v21.2d, #14
+ add \c0\().4s, v18.4s, v22.4s
+ sub \c3\().4s, v18.4s, v22.4s
+ add \c1\().4s, v24.4s, v20.4s
+ sub \c2\().4s, v24.4s, v20.4s
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3
+ mul v16.4s, \c0\().4s, v1.s[0]
+ mla v16.4s, \c2\().4s, v1.s[1]
+ mla v16.4s, \c3\().4s, v1.s[2]
+ mul v18.4s, \c0\().4s, v1.s[2]
+ mls v18.4s, \c2\().4s, v1.s[0]
+ sub \c0\().4s, \c0\().4s, \c2\().4s
+ mls v18.4s, \c3\().4s, v1.s[1]
+ add \c0\().4s, \c0\().4s, \c3\().4s
+ mul v22.4s, \c1\().4s, v1.s[3]
+ mul v20.4s, \c0\().4s, v1.s[3]
+ add v24.4s, v16.4s, v22.4s
+ add v26.4s, v18.4s, v22.4s
+ srshr \c0\().4s, v24.4s, #14
+ add v16.4s, v16.4s, v18.4s
+ srshr \c1\().4s, v26.4s, #14
+ sub v16.4s, v16.4s, v22.4s
+ srshr \c2\().4s, v20.4s, #14
+ srshr \c3\().4s, v16.4s, #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3
+ smull v16.2d, \c0\().2s, v1.s[0]
+ smull2 v17.2d, \c0\().4s, v1.s[0]
+ smlal v16.2d, \c2\().2s, v1.s[1]
+ smlal2 v17.2d, \c2\().4s, v1.s[1]
+ smlal v16.2d, \c3\().2s, v1.s[2]
+ smlal2 v17.2d, \c3\().4s, v1.s[2]
+ smull v18.2d, \c0\().2s, v1.s[2]
+ smull2 v19.2d, \c0\().4s, v1.s[2]
+ smlsl v18.2d, \c2\().2s, v1.s[0]
+ smlsl2 v19.2d, \c2\().4s, v1.s[0]
+ sub \c0\().4s, \c0\().4s, \c2\().4s
+ smlsl v18.2d, \c3\().2s, v1.s[1]
+ smlsl2 v19.2d, \c3\().4s, v1.s[1]
+ add \c0\().4s, \c0\().4s, \c3\().4s
+ smull v22.2d, \c1\().2s, v1.s[3]
+ smull2 v23.2d, \c1\().4s, v1.s[3]
+ smull v20.2d, \c0\().2s, v1.s[3]
+ smull2 v21.2d, \c0\().4s, v1.s[3]
+ add v24.2d, v16.2d, v22.2d
+ add v25.2d, v17.2d, v23.2d
+ add v26.2d, v18.2d, v22.2d
+ add v27.2d, v19.2d, v23.2d
+ rshrn \c0\().2s, v24.2d, #14
+ rshrn2 \c0\().4s, v25.2d, #14
+ add v16.2d, v16.2d, v18.2d
+ add v17.2d, v17.2d, v19.2d
+ rshrn \c1\().2s, v26.2d, #14
+ rshrn2 \c1\().4s, v27.2d, #14
+ sub v16.2d, v16.2d, v22.2d
+ sub v17.2d, v17.2d, v23.2d
+ rshrn \c2\().2s, v20.2d, #14
+ rshrn2 \c2\().4s, v21.2d, #14
+ rshrn \c3\().2s, v16.2d, #14
+ rshrn2 \c3\().4s, v17.2d, #14
+.endm
+
+// The public functions in this file have the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
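A hypothetical call site, for orientation only (not taken from the patch; the variable names are illustrative). In the 10 bpp build the destination is a plane of 16-bit pixels passed through the uint8_t * prototype, the stride is in bytes, block holds the dequantized coefficients as laid out by the decoder, and eob == 1 selects the DC-only shortcut inside the idct/idct functions:

    /* Add the inverse 4x4 DCT of `block` onto the destination block in place. */
    ff_vp9_idct_idct_4x4_add_10_neon((uint8_t *)dst16, stride_bytes, block, eob);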
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel x4, itxfm4_coeffs
+ ld1 {v0.4h}, [x4]
+ sxtl v0.4s, v0.4h
+.endif
+.ifc \txfm1,iadst
+ movrel x4, iadst4_coeffs
+ ld1 {v0.d}[1], [x4]
+ sxtl2 v1.4s, v0.8h
+.endif
+.else
+ movrel x4, itxfm4_coeffs
+ ld1 {v0.8h}, [x4]
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+.endif
+
+ movi v30.4s, #0
+ movi v31.4s, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.ne 1f
+ // DC-only for idct/idct
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v31.s}[0], [x2]
+ dup v4.4s, v2.s[0]
+ mov v5.16b, v4.16b
+ mov v6.16b, v4.16b
+ mov v7.16b, v4.16b
+ b 2f
+.endif
+
+1:
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2]
+ st1 {v30.4s,v31.4s}, [x2], #32
+
+.ifc \txfm1,iwht
+ sshr v4.4s, v4.4s, #2
+ sshr v5.4s, v5.4s, #2
+ sshr v6.4s, v6.4s, #2
+ sshr v7.4s, v7.4s, #2
+.endif
+
+ \txfm1\()4_\bpp v4, v5, v6, v7
+
+ st1 {v30.4s,v31.4s}, [x2], #32
+ // Transpose 4x4 with 32 bit elements
+ transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19
+
+ \txfm2\()4_\bpp v4, v5, v6, v7
+2:
+ mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+ ld1 {v0.4h}, [x0], x1
+ ld1 {v1.4h}, [x0], x1
+.ifnc \txfm1,iwht
+ srshr v4.4s, v4.4s, #4
+ srshr v5.4s, v5.4s, #4
+ srshr v6.4s, v6.4s, #4
+ srshr v7.4s, v7.4s, #4
+.endif
+ uaddw v4.4s, v4.4s, v0.4h
+ uaddw v5.4s, v5.4s, v1.4h
+ ld1 {v2.4h}, [x0], x1
+ ld1 {v3.4h}, [x0], x1
+ sqxtun v0.4h, v4.4s
+ sqxtun2 v0.8h, v5.4s
+ sub x0, x0, x1, lsl #2
+
+ uaddw v6.4s, v6.4s, v2.4h
+ umin v0.8h, v0.8h, v31.8h
+ uaddw v7.4s, v7.4s, v3.4h
+ st1 {v0.4h}, [x0], x1
+ sqxtun v2.4h, v6.4s
+ sqxtun2 v2.8h, v7.4s
+ umin v2.8h, v2.8h, v31.8h
+
+ st1 {v0.d}[1], [x0], x1
+ st1 {v2.4h}, [x0], x1
+ st1 {v2.d}[1], [x0], x1
+
+ ret
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct, idct, \bpp
+itxfm_func4x4 iadst, idct, \bpp
+itxfm_func4x4 idct, iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht, iwht, \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+function idct8x8_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+
+ movi v1.4h, #0
+ sxtl v0.4s, v0.4h
+
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v1.s}[0], [x2]
+ dup v2.4s, v2.s[0]
+
+ srshr v2.4s, v2.4s, #5
+
+ mov x4, #8
+ mov x3, x0
+ dup v31.8h, w5
+1:
+ // Loop to add the constant from v2 into all 8x8 outputs
+ subs x4, x4, #2
+ ld1 {v3.8h}, [x0], x1
+ ld1 {v4.8h}, [x0], x1
+ uaddw v16.4s, v2.4s, v3.4h
+ uaddw2 v17.4s, v2.4s, v3.8h
+ uaddw v18.4s, v2.4s, v4.4h
+ uaddw2 v19.4s, v2.4s, v4.8h
+ sqxtun v3.4h, v16.4s
+ sqxtun2 v3.8h, v17.4s
+ sqxtun v4.4h, v18.4s
+ sqxtun2 v4.8h, v19.4s
+ umin v3.8h, v3.8h, v31.8h
+ umin v4.8h, v4.8h, v31.8h
+ st1 {v3.8h}, [x3], x1
+ st1 {v4.8h}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
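Written out, the DC-only path above computes the following (a restatement of the code in C-style arithmetic, not an independent specification):

    dc  = (block[0] * 11585 + (1 << 13)) >> 14;    /* first pass of the Q14 DC scaling  */
    dc  = (dc * 11585 + (1 << 13)) >> 14;          /* second pass, same scaling         */
    add = (dc + 16) >> 5;                          /* srshr #5: final rounding shift    */
    out[i] = min(max(dst[i] + add, 0), pixel_max); /* sqxtun + umin against v31         */

where pixel_max is the (1 << bpp) - 1 value that the 8x8 wrapper functions place in w5 before branching here. The 16x16 DC path further down is the same except for a ">> 6" final shift.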
+
+.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+ dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
+ dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3 // r2 = t2a, r6 = t3a
+ dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3 // r1 = t4a, r7 = t7a
+ dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3 // r5 = t5a, r3 = t6a
+
+ butterfly_4s \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
+ butterfly_4s \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
+ butterfly_4s \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
+ butterfly_4s \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
+
+ dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
+
+ butterfly_4s \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
+ butterfly_4s \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
+ butterfly_4s \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
+ butterfly_4s \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
+.endm
+
+.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+ dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a
+ dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a
+
+ dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
+ dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
+
+ dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a
+ dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a
+
+ dbutterfly_n \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
+ dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
+
+ butterfly_4s \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3
+ neg \r7\().4s, \r7\().4s // r7 = out[7]
+ butterfly_4s \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2
+
+ dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a
+ dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a
+
+ dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7
+
+ dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4]
+ neg \r3\().4s, \r3\().4s // r3 = out[3]
+
+ dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
+ neg \r1\().4s, \r1\().4s // r1 = out[1]
+
+ dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5]
+ neg \r5\().4s, \r5\().4s // r5 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.eq idct8x8_dc_add_neon
+.endif
+ // The iadst also uses a few coefficients from
+ // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+ movrel x4, idct_coeffs
+.else
+ movrel x4, iadst8_coeffs
+ ld1 {v1.8h}, [x4], #16
+ stp d8, d9, [sp, #-0x10]!
+ sxtl2 v3.4s, v1.8h
+ sxtl v2.4s, v1.4h
+.endif
+ ld1 {v0.8h}, [x4]
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+
+ movi v4.4s, #0
+ movi v5.4s, #0
+ movi v6.4s, #0
+ movi v7.4s, #0
+
+1:
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
+ ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
+ ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ sub x2, x2, #256
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
+ idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
+.else
+ \txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
+ \txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
+.endif
+
+ // Transpose 8x8 with 32 bit elements
+ transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
+ idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
+.else
+ \txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
+ \txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
+.endif
+2:
+ mov x3, x0
+ // Add into the destination
+ ld1 {v0.8h}, [x0], x1
+ srshr v16.4s, v16.4s, #5
+ srshr v17.4s, v17.4s, #5
+ ld1 {v1.8h}, [x0], x1
+ srshr v18.4s, v18.4s, #5
+ srshr v19.4s, v19.4s, #5
+ ld1 {v2.8h}, [x0], x1
+ srshr v20.4s, v20.4s, #5
+ srshr v21.4s, v21.4s, #5
+ uaddw v16.4s, v16.4s, v0.4h
+ uaddw2 v17.4s, v17.4s, v0.8h
+ ld1 {v3.8h}, [x0], x1
+ srshr v22.4s, v22.4s, #5
+ srshr v23.4s, v23.4s, #5
+ uaddw v18.4s, v18.4s, v1.4h
+ uaddw2 v19.4s, v19.4s, v1.8h
+ ld1 {v4.8h}, [x0], x1
+ srshr v24.4s, v24.4s, #5
+ srshr v25.4s, v25.4s, #5
+ uaddw v20.4s, v20.4s, v2.4h
+ uaddw2 v21.4s, v21.4s, v2.8h
+ sqxtun v0.4h, v16.4s
+ sqxtun2 v0.8h, v17.4s
+ dup v16.8h, w5
+ ld1 {v5.8h}, [x0], x1
+ srshr v26.4s, v26.4s, #5
+ srshr v27.4s, v27.4s, #5
+ uaddw v22.4s, v22.4s, v3.4h
+ uaddw2 v23.4s, v23.4s, v3.8h
+ sqxtun v1.4h, v18.4s
+ sqxtun2 v1.8h, v19.4s
+ umin v0.8h, v0.8h, v16.8h
+ ld1 {v6.8h}, [x0], x1
+ srshr v28.4s, v28.4s, #5
+ srshr v29.4s, v29.4s, #5
+ uaddw v24.4s, v24.4s, v4.4h
+ uaddw2 v25.4s, v25.4s, v4.8h
+ sqxtun v2.4h, v20.4s
+ sqxtun2 v2.8h, v21.4s
+ umin v1.8h, v1.8h, v16.8h
+ ld1 {v7.8h}, [x0], x1
+ srshr v30.4s, v30.4s, #5
+ srshr v31.4s, v31.4s, #5
+ uaddw v26.4s, v26.4s, v5.4h
+ uaddw2 v27.4s, v27.4s, v5.8h
+ sqxtun v3.4h, v22.4s
+ sqxtun2 v3.8h, v23.4s
+ umin v2.8h, v2.8h, v16.8h
+
+ st1 {v0.8h}, [x3], x1
+ uaddw v28.4s, v28.4s, v6.4h
+ uaddw2 v29.4s, v29.4s, v6.8h
+ st1 {v1.8h}, [x3], x1
+ sqxtun v4.4h, v24.4s
+ sqxtun2 v4.8h, v25.4s
+ umin v3.8h, v3.8h, v16.8h
+ st1 {v2.8h}, [x3], x1
+ uaddw v30.4s, v30.4s, v7.4h
+ uaddw2 v31.4s, v31.4s, v7.8h
+ st1 {v3.8h}, [x3], x1
+ sqxtun v5.4h, v26.4s
+ sqxtun2 v5.8h, v27.4s
+ umin v4.8h, v4.8h, v16.8h
+ st1 {v4.8h}, [x3], x1
+ sqxtun v6.4h, v28.4s
+ sqxtun2 v6.8h, v29.4s
+ umin v5.8h, v5.8h, v16.8h
+ st1 {v5.8h}, [x3], x1
+ sqxtun v7.4h, v30.4s
+ sqxtun2 v7.8h, v31.4s
+ umin v6.8h, v6.8h, v16.8h
+
+ st1 {v6.8h}, [x3], x1
+ umin v7.8h, v7.8h, v16.8h
+ st1 {v7.8h}, [x3], x1
+
+.ifnc \txfm1\()_\txfm2,idct_idct
+ ldp d8, d9, [sp], 0x10
+.endif
+ ret
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+ mov x5, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+ mov x5, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+ sxtl v0.4s, v0.4h
+
+ movi v1.4h, #0
+
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v1.s}[0], [x2]
+ dup v2.4s, v2.s[0]
+
+ srshr v0.4s, v2.4s, #6
+
+ mov x3, x0
+ mov x4, #16
+ dup v31.8h, w13
+1:
+ // Loop to add the constant from v0 into all 16x16 outputs
+ subs x4, x4, #2
+ ld1 {v1.8h,v2.8h}, [x0], x1
+ uaddw v16.4s, v0.4s, v1.4h
+ uaddw2 v17.4s, v0.4s, v1.8h
+ ld1 {v3.8h,v4.8h}, [x0], x1
+ uaddw v18.4s, v0.4s, v2.4h
+ uaddw2 v19.4s, v0.4s, v2.8h
+ uaddw v20.4s, v0.4s, v3.4h
+ uaddw2 v21.4s, v0.4s, v3.8h
+ uaddw v22.4s, v0.4s, v4.4h
+ uaddw2 v23.4s, v0.4s, v4.8h
+ sqxtun v1.4h, v16.4s
+ sqxtun2 v1.8h, v17.4s
+ sqxtun v2.4h, v18.4s
+ sqxtun2 v2.8h, v19.4s
+ sqxtun v3.4h, v20.4s
+ sqxtun2 v3.8h, v21.4s
+ sqxtun v4.4h, v22.4s
+ sqxtun2 v4.8h, v23.4s
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h,v2.8h}, [x3], x1
+ umin v3.8h, v3.8h, v31.8h
+ umin v4.8h, v4.8h, v31.8h
+ st1 {v3.8h,v4.8h}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro idct16_end
+ butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a
+ butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6
+ butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5
+ butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4
+ butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a
+ butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10
+ butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13
+ butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a
+
+ dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a
+ dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
+
+ butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
+ butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
+ butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
+ butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
+ butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13]
+ butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
+ butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
+ butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10]
+ ret
+.endm
+
+function idct16
+ dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
+ dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
+ dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
+ dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a
+ dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a
+ dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a
+ dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+ dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+ butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
+ dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
+ dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
+ dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a
+ dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a
+ dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a
+ dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+ dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+ butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ dsmull_h v24, v25, v19, v3.s[3]
+ dsmull_h v4, v5, v17, v2.s[0]
+ dsmull_h v7, v6, v18, v1.s[1]
+ dsmull_h v30, v31, v18, v1.s[0]
+ neg v24.2d, v24.2d
+ neg v25.2d, v25.2d
+ dsmull_h v29, v28, v17, v2.s[1]
+ dsmull_h v26, v27, v19, v3.s[2]
+ dsmull_h v22, v23, v16, v0.s[0]
+ drshrn_h v24, v24, v25, #14
+ drshrn_h v16, v4, v5, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v6, v30, v31, #14
+ drshrn_h v29, v29, v28, #14
+ drshrn_h v17, v26, v27, #14
+ drshrn_h v28, v22, v23, #14
+
+ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
+ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
+ neg v22.2d, v22.2d
+ neg v23.2d, v23.2d
+ drshrn_h v27, v20, v21, #14
+ drshrn_h v21, v22, v23, #14
+ drshrn_h v23, v18, v19, #14
+ drshrn_h v25, v30, v31, #14
+ mov v4.16b, v28.16b
+ mov v5.16b, v28.16b
+ dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
+ mov v20.16b, v28.16b
+ idct16_end
+endfunc
+
+function iadst16
+ ld1 {v0.8h,v1.8h}, [x11]
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+
+ dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0
+ dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8
+ dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
+ dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2
+ dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
+
+ dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10
+ dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
+ dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4
+ dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
+
+ dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12
+ dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
+ dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6
+ dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
+
+ dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14
+ ld1 {v0.8h}, [x10]
+ dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+ dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8
+ dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
+
+ dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13
+ dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
+ dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10
+ butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0
+ dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
+
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15
+ butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1
+ dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
+ dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
+
+ butterfly_4s_r v6, v25, v18, v25 // v6 = t6, v25 = t2
+ butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3
+
+ dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15
+
+ dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
+ dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
+ neg v29.4s, v29.4s // v29 = out[13]
+
+ dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a
+ dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a
+
+ butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a
+ butterfly_4s v3, v7, v23, v21 // v3 = -out[1], v7 = t10
+
+ dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
+ neg v19.4s, v19.4s // v19 = out[3]
+ dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
+
+ butterfly_4s v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
+ butterfly_4s v4, v9, v24, v26 // v4 = out[14], v9 = t11
+
+ dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+ dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+ dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
+ dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
+
+ neg v31.4s, v5.4s // v31 = out[15]
+ neg v17.4s, v3.4s // v17 = out[1]
+
+ mov v16.16b, v2.16b
+ mov v30.16b, v4.16b
+ ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
+.macro load i, src, inc
+ ld1 {v\i\().4s}, [\src], \inc
+.endm
+.macro store i, dst, inc
+ st1 {v\i\().4s}, [\dst], \inc
+.endm
+.macro movi_v i, size, imm
+ movi v\i\()\size, \imm
+.endm
+.macro load_clear i, src, inc
+ ld1 {v\i\().4s}, [\src]
+ st1 {v4.4s}, [\src], \inc
+.endm
+
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
+ srshr \coef0, \coef0, #6
+ ld1 {v4.4h}, [x0], x1
+ srshr \coef1, \coef1, #6
+ ld1 {v4.d}[1], [x3], x1
+ srshr \coef2, \coef2, #6
+ ld1 {v5.4h}, [x0], x1
+ srshr \coef3, \coef3, #6
+ uaddw \coef0, \coef0, v4.4h
+ ld1 {v5.d}[1], [x3], x1
+ srshr \coef4, \coef4, #6
+ uaddw2 \coef1, \coef1, v4.8h
+ ld1 {v6.4h}, [x0], x1
+ srshr \coef5, \coef5, #6
+ uaddw \coef2, \coef2, v5.4h
+ ld1 {v6.d}[1], [x3], x1
+ sqxtun v4.4h, \coef0
+ srshr \coef6, \coef6, #6
+ uaddw2 \coef3, \coef3, v5.8h
+ ld1 {v7.4h}, [x0], x1
+ sqxtun2 v4.8h, \coef1
+ srshr \coef7, \coef7, #6
+ uaddw \coef4, \coef4, v6.4h
+ ld1 {v7.d}[1], [x3], x1
+ umin v4.8h, v4.8h, v8.8h
+ sub x0, x0, x1, lsl #2
+ sub x3, x3, x1, lsl #2
+ sqxtun v5.4h, \coef2
+ uaddw2 \coef5, \coef5, v6.8h
+ st1 {v4.4h}, [x0], x1
+ sqxtun2 v5.8h, \coef3
+ uaddw \coef6, \coef6, v7.4h
+ st1 {v4.d}[1], [x3], x1
+ umin v5.8h, v5.8h, v8.8h
+ sqxtun v6.4h, \coef4
+ uaddw2 \coef7, \coef7, v7.8h
+ st1 {v5.4h}, [x0], x1
+ sqxtun2 v6.8h, \coef5
+ st1 {v5.d}[1], [x3], x1
+ umin v6.8h, v6.8h, v8.8h
+ sqxtun v7.4h, \coef6
+ st1 {v6.4h}, [x0], x1
+ sqxtun2 v7.8h, \coef7
+ st1 {v6.d}[1], [x3], x1
+ umin v7.8h, v7.8h, v8.8h
+ st1 {v7.4h}, [x0], x1
+ st1 {v7.d}[1], [x3], x1
+.endm
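+// Note: callers do "dup v8.8h, w13" before using load_add_store; w13
+// holds the pixel maximum for the current bitdepth (0x3ff for 10 bpp,
+// 0xfff for 12 bpp), so the umin above clamps the result to valid
+// pixel values.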
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x4 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_4x16_pass1_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+
+ bl \txfm\()16
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the transposed 4x4 blocks horizontally.
+ cmp x1, #12
+ b.eq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ store \i, x0, #16
+.endr
+ br x14
+1:
+ // Special case: For the last input column (x1 == 12),
+ // which would be stored as the last row in the temp buffer,
+ // don't store the first 4x4 block, but keep it in registers
+ // for the first slice of the second pass (where it is the
+ // last 4x4 block).
+ add x0, x0, #16
+ st1 {v20.4s}, [x0], #16
+ st1 {v24.4s}, [x0], #16
+ st1 {v28.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v21.4s}, [x0], #16
+ st1 {v25.4s}, [x0], #16
+ st1 {v29.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v22.4s}, [x0], #16
+ st1 {v26.4s}, [x0], #16
+ st1 {v30.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v23.4s}, [x0], #16
+ st1 {v27.4s}, [x0], #16
+ st1 {v31.4s}, [x0], #16
+
+ mov v28.16b, v16.16b
+ mov v29.16b, v17.16b
+ mov v30.16b, v18.16b
+ mov v31.16b, v19.16b
+ br x14
+endfunc
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 4x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_4x16_pass2_neon
+ mov x14, x30
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+ load \i, x2, x9
+.endr
+ cbz x3, 1f
+.irp i, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl \txfm\()16
+
+ dup v8.8h, w13
+ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+ br x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+// This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+ .short 0, 10, 38, 89
+endconst
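+// If the eob is at or below the threshold for a given 4-column slice,
+// that slice (and every following one) contains only zero coefficients,
+// so pass 1 is skipped for it and the corresponding rows of the temp
+// buffer are simply filled with zeros (the b.le branches below).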
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.eq idct16x16_dc_add_neon
+.endif
+ mov x15, x30
+ // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
+.ifnc \txfm1\()_\txfm2,idct_idct
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+.endif
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #1024
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ movrel x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+ movrel x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+ ld1 {v0.8h,v1.8h}, [x10]
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+.endif
+ mov x9, #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #10
+ b.le idct16x16_quarter_add_16_neon
+ cmp w3, #38
+ b.le idct16x16_half_add_16_neon
+
+ movrel x12, min_eob_idct_idct_16, 2
+.endif
+
+.irp i, 0, 4, 8, 12
+ add x0, sp, #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh w1, [x12], #2
+ cmp w3, w1
+ mov x1, #(16 - \i)/4
+ b.le 1f
+.endif
+.endif
+ mov x1, #\i
+ add x2, x6, #(\i*4)
+ bl \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+ ld1 {v0.8h,v1.8h}, [x10]
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ // Set v28-v31 to zero, for the in-register passthrough of
+ // coefficients to pass 2.
+ movi v28.4s, #0
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+2:
+ subs x1, x1, #1
+.rept 4
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
+.endr
+ b.ne 2b
+3:
+.endif
+
+.irp i, 0, 4, 8, 12
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ mov x3, #\i
+ bl \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+ add sp, sp, #1024
+ ldp d8, d9, [sp], 0x10
+.ifnc \txfm1\()_\txfm2,idct_idct
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+.endif
+ br x15
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+ mov x13, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+ mov x13, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_4x16_pass1_quarter_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_quarter
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the transposed 4x4 blocks horizontally.
+ // The first 4x4 block is kept in registers for the second pass,
+ // store the rest in the temp buffer.
+ add x0, x0, #16
+ st1 {v20.4s}, [x0], #16
+ st1 {v24.4s}, [x0], #16
+ st1 {v28.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v21.4s}, [x0], #16
+ st1 {v25.4s}, [x0], #16
+ st1 {v29.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v22.4s}, [x0], #16
+ st1 {v26.4s}, [x0], #16
+ st1 {v30.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v23.4s}, [x0], #16
+ st1 {v27.4s}, [x0], #16
+ st1 {v31.4s}, [x0], #16
+ br x14
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+ mov x14, x30
+
+ // Only load the top 4 lines, and only do it for the later slices.
+ // For the first slice, v16-v19 are kept in registers from the first pass.
+ cbz x3, 1f
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_quarter
+
+ dup v8.8h, w13
+ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+ br x14
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_half
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the transposed 4x4 blocks horizontally.
+ cmp x1, #4
+ b.eq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ store \i, x0, #16
+.endr
+ br x14
+1:
+ // Special case: For the second input column (x1 == 4),
+ // which would be stored as the second row in the temp buffer,
+ // don't store the first 4x4 block, but keep it in registers
+ // for the first slice of the second pass (where it is the
+ // second 4x4 block).
+ add x0, x0, #16
+ st1 {v20.4s}, [x0], #16
+ st1 {v24.4s}, [x0], #16
+ st1 {v28.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v21.4s}, [x0], #16
+ st1 {v25.4s}, [x0], #16
+ st1 {v29.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v22.4s}, [x0], #16
+ st1 {v26.4s}, [x0], #16
+ st1 {v30.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v23.4s}, [x0], #16
+ st1 {v27.4s}, [x0], #16
+ st1 {v31.4s}, [x0], #16
+
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v22.16b, v18.16b
+ mov v23.16b, v19.16b
+ br x14
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+ mov x14, x30
+
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ cbz x3, 1f
+.irp i, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_half
+
+ dup v8.8h, w13
+ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+ br x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+ add x0, sp, #(0*64)
+ mov x1, #0
+ add x2, x6, #(0*4)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+ add x0, sp, #(4*64)
+ mov x1, #4
+ add x2, x6, #(4*4)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.endif
+
+.irp i, 0, 4, 8, 12
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ mov x3, #\i
+ bl idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, #1024
+ ldp d8, d9, [sp], 0x10
+ br x15
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+ sxtl v0.4s, v0.4h
+
+ movi v1.4h, #0
+
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v1.s}[0], [x2]
+ dup v2.4s, v2.s[0]
+
+ srshr v0.4s, v2.4s, #6
+
+ mov x3, x0
+ mov x4, #32
+ sub x1, x1, #32
+ dup v31.8h, w13
+1:
+ // Loop to add the constant v0 into all 32x32 outputs
+ subs x4, x4, #1
+ ld1 {v1.8h,v2.8h}, [x0], #32
+ uaddw v16.4s, v0.4s, v1.4h
+ uaddw2 v17.4s, v0.4s, v1.8h
+ ld1 {v3.8h,v4.8h}, [x0], x1
+ uaddw v18.4s, v0.4s, v2.4h
+ uaddw2 v19.4s, v0.4s, v2.8h
+ uaddw v20.4s, v0.4s, v3.4h
+ uaddw2 v21.4s, v0.4s, v3.8h
+ uaddw v22.4s, v0.4s, v4.4h
+ uaddw2 v23.4s, v0.4s, v4.8h
+ sqxtun v1.4h, v16.4s
+ sqxtun2 v1.8h, v17.4s
+ sqxtun v2.4h, v18.4s
+ sqxtun2 v2.8h, v19.4s
+ sqxtun v3.4h, v20.4s
+ sqxtun2 v3.8h, v21.4s
+ sqxtun v4.4h, v22.4s
+ sqxtun2 v4.8h, v23.4s
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h,v2.8h}, [x3], #32
+ umin v3.8h, v3.8h, v31.8h
+ umin v4.8h, v4.8h, v31.8h
+ st1 {v3.8h,v4.8h}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro idct32_end
+ butterfly_4s v16, v5, v4, v5 // v16 = t16a, v5 = t19a
+ butterfly_4s v17, v20, v23, v20 // v17 = t17, v20 = t18
+ butterfly_4s v18, v6, v7, v6 // v18 = t23a, v6 = t20a
+ butterfly_4s v19, v21, v22, v21 // v19 = t22, v21 = t21
+ butterfly_4s v4, v28, v28, v30 // v4 = t24a, v28 = t27a
+ butterfly_4s v23, v26, v25, v26 // v23 = t25, v26 = t26
+ butterfly_4s v7, v8, v29, v31 // v7 = t31a, v8 = t28a
+ butterfly_4s v22, v27, v24, v27 // v22 = t30, v27 = t29
+
+ dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
+ dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31 // v8 = t19, v5 = t28
+ dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
+ dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+ butterfly_4s v31, v24, v7, v4 // v31 = t31, v24 = t24
+ butterfly_4s v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+ butterfly_4s_r v23, v16, v16, v18 // v23 = t23, v16 = t16
+ butterfly_4s_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+ butterfly_4s v18, v21, v27, v21 // v18 = t18, v21 = t21
+ butterfly_4s_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
+ butterfly_4s v29, v26, v20, v26 // v29 = t29, v26 = t26
+ butterfly_4s v19, v20, v8, v6 // v19 = t19a, v20 = t20
+
+ dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27, v20 = t20
+ dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
+ dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22
+ dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
+ ret
+.endm
+
+function idct32_odd
+ dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly_h2 v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ dsmull_h v4, v5, v16, v10.s[0]
+ dsmull_h v28, v29, v19, v11.s[3]
+ dsmull_h v30, v31, v16, v10.s[1]
+ dsmull_h v22, v23, v17, v13.s[2]
+ dsmull_h v7, v6, v17, v13.s[3]
+ dsmull_h v26, v27, v19, v11.s[2]
+ dsmull_h v20, v21, v18, v12.s[0]
+ dsmull_h v24, v25, v18, v12.s[1]
+
+ neg v28.2d, v28.2d
+ neg v29.2d, v29.2d
+ neg v7.2d, v7.2d
+ neg v6.2d, v6.2d
+
+ drshrn_h v4, v4, v5, #14
+ drshrn_h v5, v28, v29, #14
+ drshrn_h v29, v30, v31, #14
+ drshrn_h v28, v22, v23, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v31, v26, v27, #14
+ drshrn_h v6, v20, v21, #14
+ drshrn_h v30, v24, v25, #14
+
+ dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1]
+ dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1]
+ drshrn_h v23, v16, v17, #14
+ drshrn_h v24, v18, v19, #14
+ neg v20.2d, v20.2d
+ neg v21.2d, v21.2d
+ drshrn_h v27, v27, v26, #14
+ drshrn_h v20, v20, v21, #14
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3]
+ drshrn_h v21, v16, v17, #14
+ drshrn_h v26, v18, v19, #14
+ dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3]
+ drshrn_h v25, v16, v17, #14
+ neg v18.2d, v18.2d
+ neg v19.2d, v19.2d
+ drshrn_h v22, v18, v19, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
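+//
+// For reference: with e[0..15] the outputs of the even-input idct16 and
+// o[0..15] the outputs of the odd-input transform, the final result is
+// roughly out[i] = e[i] + o[i] and out[31-i] = e[i] - o[i] for
+// i = 0..15, which is what the add/subtract stores below implement.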
+function idct32_1d_4x32_pass1\suffix\()_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+
+ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+.endif
+
+ bl idct16\suffix
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the registers a, b, c, d horizontally, followed by the
+ // same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+ // There's no rev128 instruction, but we reverse each 64 bit
+ // half, and then flip them using an ext with 8 bytes offset.
+ rev64 v7.4s, \d
+ st1 {\a}, [x0], #16
+ ext v7.16b, v7.16b, v7.16b, #8
+ st1 {\b}, [x0], #16
+ rev64 v6.4s, \c
+ st1 {\c}, [x0], #16
+ ext v6.16b, v6.16b, v6.16b, #8
+ st1 {\d}, [x0], #16
+ rev64 v5.4s, \b
+ st1 {v7.4s}, [x0], #16
+ ext v5.16b, v5.16b, v5.16b, #8
+ st1 {v6.4s}, [x0], #16
+ rev64 v4.4s, \a
+ st1 {v5.4s}, [x0], #16
+ ext v4.16b, v4.16b, v4.16b, #8
+ st1 {v4.4s}, [x0], #16
+.endm
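+ // For example, a .4s register holding lanes {a, b, c, d} becomes
+ // {b, a, d, c} after the rev64 and {d, c, b, a} after the ext #8,
+ // i.e. fully reversed.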
+ store_rev v16.4s, v20.4s, v24.4s, v28.4s
+ store_rev v17.4s, v21.4s, v25.4s, v29.4s
+ store_rev v18.4s, v22.4s, v26.4s, v30.4s
+ store_rev v19.4s, v23.4s, v27.4s, v31.4s
+ sub x0, x0, #512
+.purgem store_rev
+
+ // Move x2 back to the start of the input, and move
+ // to the first odd row
+.ifb \suffix
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+ sub x2, x2, x9, lsl #3
+.endif
+ add x2, x2, #128
+
+ movi v4.4s, #0
+ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7
+ transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7
+ transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7
+ transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7
+
+ // Store the registers a, b, c, d horizontally,
+ // adding into the output first, and then the same registers
+ // mirrored (d, c, b, a), subtracted from the output.
+.macro store_rev a, b, c, d, a16b, b16b
+ ld1 {v4.4s}, [x0]
+ rev64 v9.4s, \d
+ add v4.4s, v4.4s, \a
+ st1 {v4.4s}, [x0], #16
+ rev64 v8.4s, \c
+ ld1 {v4.4s}, [x0]
+ ext v9.16b, v9.16b, v9.16b, #8
+ add v4.4s, v4.4s, \b
+ st1 {v4.4s}, [x0], #16
+ ext v8.16b, v8.16b, v8.16b, #8
+ ld1 {v4.4s}, [x0]
+ rev64 \b, \b
+ add v4.4s, v4.4s, \c
+ st1 {v4.4s}, [x0], #16
+ rev64 \a, \a
+ ld1 {v4.4s}, [x0]
+ ext \b16b, \b16b, \b16b, #8
+ add v4.4s, v4.4s, \d
+ st1 {v4.4s}, [x0], #16
+ ext \a16b, \a16b, \a16b, #8
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, v9.4s
+ st1 {v4.4s}, [x0], #16
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, v8.4s
+ st1 {v4.4s}, [x0], #16
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, \b
+ st1 {v4.4s}, [x0], #16
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, \a
+ st1 {v4.4s}, [x0], #16
+.endm
+
+ store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
+ store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
+ store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
+ store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
+.purgem store_rev
+ br x14
+endfunc
+
+// This is mostly the same as 4x32_pass1, but without the transpose;
+// it uses the source as a temp buffer between the two idct passes, and
+// adds into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_4x32_pass2\suffix\()_neon
+ mov x14, x30
+
+ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #3
+.endif
+
+ bl idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ store \i, x2, x9
+.endr
+
+ sub x2, x2, x9, lsl #4
+ add x2, x2, #128
+
+ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #3
+.endif
+ sub x2, x2, #128
+
+ bl idct32_odd\suffix
+
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+ ld1 {v4.4s}, [x2], x9
+ ld1 {v5.4s}, [x2], x9
+ add v4.4s, v4.4s, \a
+ ld1 {v6.4s}, [x2], x9
+ add v5.4s, v5.4s, \b
+ ld1 {v7.4s}, [x2], x9
+ add v6.4s, v6.4s, \c
+ add v7.4s, v7.4s, \d
+.else
+ ld1 {v4.4s}, [x2], x7
+ ld1 {v5.4s}, [x2], x7
+ sub v4.4s, v4.4s, \a
+ ld1 {v6.4s}, [x2], x7
+ sub v5.4s, v5.4s, \b
+ ld1 {v7.4s}, [x2], x7
+ sub v6.4s, v6.4s, \c
+ sub v7.4s, v7.4s, \d
+.endif
+ ld1 {v8.4h}, [x0], x1
+ ld1 {v8.d}[1], [x0], x1
+ srshr v4.4s, v4.4s, #6
+ ld1 {v9.4h}, [x0], x1
+ srshr v5.4s, v5.4s, #6
+ uaddw v4.4s, v4.4s, v8.4h
+ ld1 {v9.d}[1], [x0], x1
+ srshr v6.4s, v6.4s, #6
+ uaddw2 v5.4s, v5.4s, v8.8h
+ srshr v7.4s, v7.4s, #6
+ sub x0, x0, x1, lsl #2
+ uaddw v6.4s, v6.4s, v9.4h
+ sqxtun v4.4h, v4.4s
+ uaddw2 v7.4s, v7.4s, v9.8h
+ sqxtun2 v4.8h, v5.4s
+ umin v4.8h, v4.8h, v15.8h
+ st1 {v4.4h}, [x0], x1
+ sqxtun v5.4h, v6.4s
+ st1 {v4.d}[1], [x0], x1
+ sqxtun2 v5.8h, v7.4s
+ umin v5.8h, v5.8h, v15.8h
+ st1 {v5.4h}, [x0], x1
+ st1 {v5.d}[1], [x0], x1
+.endm
+ load_acc_store v31.4s, v30.4s, v29.4s, v28.4s
+ load_acc_store v27.4s, v26.4s, v25.4s, v24.4s
+ load_acc_store v23.4s, v22.4s, v21.4s, v20.4s
+ load_acc_store v19.4s, v18.4s, v17.4s, v16.4s
+ sub x2, x2, x9
+ load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1
+ load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1
+ load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1
+ load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1
+.purgem load_acc_store
+ br x14
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon
+ cmp w3, #1
+ b.eq idct32x32_dc_add_neon
+
+ movrel x10, idct_coeffs
+
+ mov x15, x30
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+
+ sub sp, sp, #4096
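+ // 32*32*4 bytes of scratch for the 32 bit intermediate
+ // coefficients between the two passes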
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ // Double stride of the input, since we only read every other line
+ mov x9, #256
+ neg x7, x9
+
+ ld1 {v0.8h,v1.8h}, [x10], #32
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+ ld1 {v10.8h,v11.8h}, [x10]
+ sxtl v12.4s, v11.4h
+ sxtl2 v13.4s, v11.8h
+ sxtl2 v11.4s, v10.8h
+ sxtl v10.4s, v10.4h
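+ // The coefficient tables are stored as 16 bit values; the high
+ // bitdepth code works on 32 bit lanes, hence the sxtl widening
+ // into .4s registers above.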
+
+ dup v15.8h, w13
+
+ cmp w3, #34
+ b.le idct32x32_quarter_add_16_neon
+ cmp w3, #135
+ b.le idct32x32_half_add_16_neon
+
+ movrel x12, min_eob_idct_idct_32, 2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x0, sp, #(\i*128)
+.if \i > 0
+ ldrh w1, [x12], #2
+ cmp w3, w1
+ mov x1, #(32 - \i)/4
+ b.le 1f
+.endif
+ add x2, x6, #(\i*4)
+ bl idct32_1d_4x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ // Write zeros to the temp buffer for pass 2
+ movi v16.4s, #0
+ movi v17.4s, #0
+ movi v18.4s, #0
+ movi v19.4s, #0
+2:
+ subs x1, x1, #1
+.rept 4
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+.endr
+ b.ne 2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ bl idct32_1d_4x32_pass2_neon
+.endr
+
+ add sp, sp, #4096
+ ldp d14, d15, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d8, d9, [sp], 0x10
+
+ br x15
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+ mov x13, #0x03ff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+ mov x13, #0x0fff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 4
+ add x0, sp, #(\i*128)
+.ifc \size,quarter
+.if \i == 4
+ cmp w3, #9
+ b.le 1f
+.endif
+.endif
+ add x2, x6, #(\i*4)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+ add x0, sp, #(\i*128)
+.if \i == 12
+ cmp w3, #70
+ b.le 1f
+.endif
+ add x2, x6, #(\i*4)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+ b 3f
+
+1:
+ // Write zeros to the temp buffer for pass 2
+ movi v16.4s, #0
+ movi v17.4s, #0
+ movi v18.4s, #0
+ movi v19.4s, #0
+
+.rept 4
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ bl idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, #4096
+ ldp d14, d15, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d8, d9, [sp], 0x10
+
+ br x15
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
new file mode 100644
index 0000000000..99413b0f70
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
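+// For reference, the idct_coeffs entries are VP9's Q14 cosine constants
+// cospi_k_64 = round(16384 * cos(k*pi/64)), e.g. 11585 for k = 16 and
+// 15137 for k = 8.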
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
+// in/out are .8h registers; this can do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
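+// As used here, v0.h[0] holds 11585 = round(16384 * cos(pi/4)), so the
+// results are roughly (in1 +/- in2) / sqrt(2) with Q14 rounding.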
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+ neg \tmp4\().4h, v0.4h
+.endif
+ add \tmp1\().8h, \in1\().8h, \in2\().8h
+ sub \tmp2\().8h, \in1\().8h, \in2\().8h
+.if \neg > 0
+ smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
+ smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
+.else
+ smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
+ smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
+.endif
+.ifb \tmp5
+ rshrn \out1\().4h, \tmp3\().4s, #14
+ rshrn2 \out1\().8h, \tmp4\().4s, #14
+ smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
+ smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
+ rshrn \out2\().4h, \tmp3\().4s, #14
+ rshrn2 \out2\().8h, \tmp4\().4s, #14
+.else
+ smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
+ smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
+ rshrn \out1\().4h, \tmp3\().4s, #14
+ rshrn2 \out1\().8h, \tmp4\().4s, #14
+ rshrn \out2\().4h, \tmp5\().4s, #14
+ rshrn2 \out2\().8h, \tmp6\().4s, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+ smull \tmp1\().4s, \in1\().4h, v0.h[0]
+ smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
+ rshrn \out1\().4h, \tmp1\().4s, #14
+ rshrn2 \out1\().8h, \tmp2\().4s, #14
+ rshrn \out2\().4h, \tmp1\().4s, #14
+ rshrn2 \out2\().8h, \tmp2\().4s, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .4s registers, in are 2 x .8h registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+ smull \out1\().4s, \in1\().4h, \coef1
+ smull2 \out2\().4s, \in1\().8h, \coef1
+ smull \out3\().4s, \in1\().4h, \coef2
+ smull2 \out4\().4s, \in1\().8h, \coef2
+ smlsl \out1\().4s, \in2\().4h, \coef2
+ smlsl2 \out2\().4s, \in2\().8h, \coef2
+ smlal \out3\().4s, \in2\().4h, \coef1
+ smlal2 \out4\().4s, \in2\().8h, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .8h registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+ neg \tmp3\().4s, \tmp3\().4s
+ neg \tmp4\().4s, \tmp4\().4s
+.endif
+ rshrn \inout1\().4h, \tmp1\().4s, #14
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14
+ rshrn \inout2\().4h, \tmp3\().4s, #14
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().4s, \inout1\().4h, \coef1
+ smull2 \tmp2\().4s, \inout1\().8h, \coef1
+ smull \tmp3\().4s, \inout1\().4h, \coef2
+ smull2 \tmp4\().4s, \inout1\().8h, \coef2
+ rshrn \inout1\().4h, \tmp1\().4s, #14
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14
+ rshrn \inout2\().4h, \tmp3\().4s, #14
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().4s, \inout2\().4h, \coef2
+ smull2 \tmp2\().4s, \inout2\().8h, \coef2
+ smull \tmp3\().4s, \inout2\().4h, \coef1
+ smull2 \tmp4\().4s, \inout2\().8h, \coef1
+ neg \tmp1\().4s, \tmp1\().4s
+ neg \tmp2\().4s, \tmp2\().4s
+ rshrn \inout2\().4h, \tmp3\().4s, #14
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14
+ rshrn \inout1\().4h, \tmp1\().4s, #14
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14
+.endm
+
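+// Widening multiply: out1/out2 (.4s) get the low/high halves of
+// in (.8h) multiplied by the scalar coefficient coef.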
+.macro dsmull_h out1, out2, in, coef
+ smull \out1\().4s, \in\().4h, \coef
+ smull2 \out2\().4s, \in\().8h, \coef
+.endm
+
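+// Narrow two .4s halves back into a single .8h register with a
+// rounding right shift.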
+.macro drshrn_h out, in1, in2, shift
+ rshrn \out\().4h, \in1\().4s, \shift
+ rshrn2 \out\().8h, \in2\().4s, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_8h out1, out2, in1, in2
+ add \out1\().8h, \in1\().8h, \in2\().8h
+ sub \out2\().8h, \in1\().8h, \in2\().8h
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_8h_r out1, out2, in1, in2
+ sub \out1\().8h, \in1\().8h, \in2\().8h
+ add \out2\().8h, \in1\().8h, \in2\().8h
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .8h registers, in are 4 x .4s registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ add \tmp1\().4s, \in1\().4s, \in3\().4s
+ add \tmp2\().4s, \in2\().4s, \in4\().4s
+ sub \tmp3\().4s, \in1\().4s, \in3\().4s
+ sub \tmp4\().4s, \in2\().4s, \in4\().4s
+ rshrn \out1\().4h, \tmp1\().4s, #14
+ rshrn2 \out1\().8h, \tmp2\().4s, #14
+ rshrn \out2\().4h, \tmp3\().4s, #14
+ rshrn2 \out2\().8h, \tmp4\().4s, #14
+.endm
+
+.macro iwht4 c0, c1, c2, c3
+ add \c0\().4h, \c0\().4h, \c1\().4h
+ sub v17.4h, \c2\().4h, \c3\().4h
+ sub v16.4h, \c0\().4h, v17.4h
+ sshr v16.4h, v16.4h, #1
+ sub \c2\().4h, v16.4h, \c1\().4h
+ sub \c1\().4h, v16.4h, \c3\().4h
+ add \c3\().4h, v17.4h, \c2\().4h
+ sub \c0\().4h, \c0\().4h, \c1\().4h
+.endm
+
+.macro idct4 c0, c1, c2, c3
+ smull v22.4s, \c1\().4h, v0.h[3]
+ smull v20.4s, \c1\().4h, v0.h[2]
+ add v16.4h, \c0\().4h, \c2\().4h
+ sub v17.4h, \c0\().4h, \c2\().4h
+ smlal v22.4s, \c3\().4h, v0.h[2]
+ smull v18.4s, v16.4h, v0.h[0]
+ smull v19.4s, v17.4h, v0.h[0]
+ smlsl v20.4s, \c3\().4h, v0.h[3]
+ rshrn v22.4h, v22.4s, #14
+ rshrn v18.4h, v18.4s, #14
+ rshrn v19.4h, v19.4s, #14
+ rshrn v20.4h, v20.4s, #14
+ add \c0\().4h, v18.4h, v22.4h
+ sub \c3\().4h, v18.4h, v22.4h
+ add \c1\().4h, v19.4h, v20.4h
+ sub \c2\().4h, v19.4h, v20.4h
+.endm
+
+.macro iadst4 c0, c1, c2, c3
+ smull v16.4s, \c0\().4h, v0.h[4]
+ smlal v16.4s, \c2\().4h, v0.h[5]
+ smlal v16.4s, \c3\().4h, v0.h[6]
+ smull v17.4s, \c0\().4h, v0.h[6]
+ smlsl v17.4s, \c2\().4h, v0.h[4]
+ sub \c0\().4h, \c0\().4h, \c2\().4h
+ smlsl v17.4s, \c3\().4h, v0.h[5]
+ add \c0\().4h, \c0\().4h, \c3\().4h
+ smull v19.4s, \c1\().4h, v0.h[7]
+ smull v18.4s, \c0\().4h, v0.h[7]
+ add v20.4s, v16.4s, v19.4s
+ add v21.4s, v17.4s, v19.4s
+ rshrn \c0\().4h, v20.4s, #14
+ add v16.4s, v16.4s, v17.4s
+ rshrn \c1\().4h, v21.4s, #14
+ sub v16.4s, v16.4s, v19.4s
+ rshrn \c2\().4h, v18.4s, #14
+ rshrn \c3\().4h, v16.4s, #14
+.endm
+
+// The public functions in this file have the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel x4, itxfm4_coeffs
+ ld1 {v0.4h}, [x4]
+.endif
+.ifc \txfm1,iadst
+ movrel x4, iadst4_coeffs
+ ld1 {v0.d}[1], [x4]
+.endif
+.else
+ movrel x4, itxfm4_coeffs
+ ld1 {v0.8h}, [x4]
+.endif
+
+ movi v31.8h, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.ne 1f
+ // DC-only for idct/idct
+ ld1 {v2.h}[0], [x2]
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ st1 {v31.h}[0], [x2]
+ dup v4.4h, v2.h[0]
+ mov v5.16b, v4.16b
+ mov v6.16b, v4.16b
+ mov v7.16b, v4.16b
+ b 2f
+.endif
+
+1:
+ ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
+ st1 {v31.8h}, [x2], #16
+
+.ifc \txfm1,iwht
+ sshr v4.4h, v4.4h, #2
+ sshr v5.4h, v5.4h, #2
+ sshr v6.4h, v6.4h, #2
+ sshr v7.4h, v7.4h, #2
+.endif
+
+ \txfm1\()4 v4, v5, v6, v7
+
+ st1 {v31.8h}, [x2], #16
+ // Transpose 4x4 with 16 bit elements
+ transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19
+
+ \txfm2\()4 v4, v5, v6, v7
+2:
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+.ifnc \txfm1,iwht
+ srshr v4.4h, v4.4h, #4
+ srshr v5.4h, v5.4h, #4
+ srshr v6.4h, v6.4h, #4
+ srshr v7.4h, v7.4h, #4
+.endif
+ uaddw v4.8h, v4.8h, v0.8b
+ uaddw v5.8h, v5.8h, v1.8b
+ ld1 {v2.s}[0], [x0], x1
+ ld1 {v3.s}[0], [x0], x1
+ sqxtun v0.8b, v4.8h
+ sqxtun v1.8b, v5.8h
+ sub x0, x0, x1, lsl #2
+
+ uaddw v6.8h, v6.8h, v2.8b
+ uaddw v7.8h, v7.8h, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v2.8b, v6.8h
+ sqxtun v3.8b, v7.8h
+
+ st1 {v1.s}[0], [x0], x1
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+
+ ret
+endfunc
+.endm
+
+itxfm_func4x4 idct, idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct, iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht, iwht
+
+
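+// Both idct8 and iadst8 below transform v16-v23 (.8h) in place,
+// using v2-v7 and v24-v31 as scratch.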
+.macro idct8
+ dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
+ dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+ dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+ dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+
+ butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
+ butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
+ butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a
+ butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2
+
+ dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
+
+ butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
+ butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
+ butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
+ butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
+.endm
+
+.macro iadst8
+ dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
+ dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
+ dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
+ dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a
+
+ dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
+ dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
+ dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
+ dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7
+
+ butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2
+ butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3
+ neg v23.8h, v23.8h // v23 = out[7]
+
+ dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
+ neg v19.8h, v19.8h // v19 = out[3]
+
+ dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
+ dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a
+
+ dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
+ dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
+ neg v17.8h, v17.8h // v17 = out[1]
+
+ dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
+ neg v21.8h, v21.8h // v21 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
+ // The iadst also uses a few coefficients from
+ // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+ movrel x4, idct_coeffs
+.else
+ movrel x4, iadst8_coeffs
+ ld1 {v1.8h}, [x4], #16
+.endif
+ ld1 {v0.8h}, [x4]
+
+ movi v2.8h, #0
+ movi v3.8h, #0
+ movi v4.8h, #0
+ movi v5.8h, #0
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.ne 1f
+ // DC-only for idct/idct
+ ld1 {v2.h}[0], [x2]
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ st1 {v3.h}[0], [x2]
+ dup v16.8h, v2.h[0]
+ mov v17.16b, v16.16b
+ mov v18.16b, v16.16b
+ mov v19.16b, v16.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v16.16b
+ mov v22.16b, v16.16b
+ mov v23.16b, v16.16b
+ b 2f
+.endif
+1:
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
+ ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
+ sub x2, x2, #128
+ st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
+ st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
+
+ \txfm1\()8
+
+ // Transpose 8x8 with 16 bit elements
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ \txfm2\()8
+2:
+ mov x3, x0
+ // Add into the destination
+ ld1 {v0.8b}, [x0], x1
+ srshr v16.8h, v16.8h, #5
+ ld1 {v1.8b}, [x0], x1
+ srshr v17.8h, v17.8h, #5
+ ld1 {v2.8b}, [x0], x1
+ srshr v18.8h, v18.8h, #5
+ uaddw v16.8h, v16.8h, v0.8b
+ ld1 {v3.8b}, [x0], x1
+ srshr v19.8h, v19.8h, #5
+ uaddw v17.8h, v17.8h, v1.8b
+ ld1 {v4.8b}, [x0], x1
+ srshr v20.8h, v20.8h, #5
+ uaddw v18.8h, v18.8h, v2.8b
+ sqxtun v0.8b, v16.8h
+ ld1 {v5.8b}, [x0], x1
+ srshr v21.8h, v21.8h, #5
+ uaddw v19.8h, v19.8h, v3.8b
+ sqxtun v1.8b, v17.8h
+ ld1 {v6.8b}, [x0], x1
+ srshr v22.8h, v22.8h, #5
+ uaddw v20.8h, v20.8h, v4.8b
+ sqxtun v2.8b, v18.8h
+ ld1 {v7.8b}, [x0], x1
+ srshr v23.8h, v23.8h, #5
+ uaddw v21.8h, v21.8h, v5.8b
+ sqxtun v3.8b, v19.8h
+
+ st1 {v0.8b}, [x3], x1
+ uaddw v22.8h, v22.8h, v6.8b
+ st1 {v1.8b}, [x3], x1
+ sqxtun v4.8b, v20.8h
+ st1 {v2.8b}, [x3], x1
+ uaddw v23.8h, v23.8h, v7.8b
+ st1 {v3.8b}, [x3], x1
+ sqxtun v5.8b, v21.8h
+ st1 {v4.8b}, [x3], x1
+ sqxtun v6.8b, v22.8h
+ st1 {v5.8b}, [x3], x1
+ sqxtun v7.8b, v23.8h
+
+ st1 {v6.8b}, [x3], x1
+ st1 {v7.8b}, [x3], x1
+
+ ret
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+
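+// Fast path for eob == 1: only the DC coefficient is nonzero, so the
+// output is a single constant added to every pixel. Roughly:
+//   dc = round(round(in[0] * 11585 / 16384) * 11585 / 16384)
+// and each pixel gets round(dc / 64) added (the srshr #6 below).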
+function idct16x16_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+
+ movi v1.4h, #0
+
+ ld1 {v2.h}[0], [x2]
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ dup v2.8h, v2.h[0]
+ st1 {v1.h}[0], [x2]
+
+ srshr v2.8h, v2.8h, #6
+
+ mov x3, x0
+ mov x4, #16
+1:
+ // Loop to add the constant from v2 into all 16x16 outputs
+ subs x4, x4, #2
+ ld1 {v3.16b}, [x0], x1
+ ld1 {v4.16b}, [x0], x1
+ uaddw v16.8h, v2.8h, v3.8b
+ uaddw2 v17.8h, v2.8h, v3.16b
+ uaddw v18.8h, v2.8h, v4.8b
+ uaddw2 v19.8h, v2.8h, v4.16b
+ sqxtun v3.8b, v16.8h
+ sqxtun2 v3.16b, v17.8h
+ sqxtun v4.8b, v18.8h
+ sqxtun2 v4.16b, v19.8h
+ st1 {v3.16b}, [x3], x1
+ st1 {v4.16b}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro idct16_end
+ butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
+ butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
+ butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
+ butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4
+ butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a
+ butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10
+ butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13
+ butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a
+
+ dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
+ dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
+
+ butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
+ butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
+ butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
+ butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
+ butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
+ butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
+ butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
+ butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
+ ret
+.endm
+
+function idct16
+ dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
+ dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+ dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+ dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+ dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+ dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+ dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+ dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+ butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
+ dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+ dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+ dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+ dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+ dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+ dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+ dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+ butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ dsmull_h v24, v25, v19, v1.h[7]
+ dsmull_h v4, v5, v17, v1.h[0]
+ dsmull_h v7, v6, v18, v0.h[5]
+ dsmull_h v30, v31, v18, v0.h[4]
+ neg v24.4s, v24.4s
+ neg v25.4s, v25.4s
+ dsmull_h v29, v28, v17, v1.h[1]
+ dsmull_h v26, v27, v19, v1.h[6]
+ dsmull_h v22, v23, v16, v0.h[0]
+ drshrn_h v24, v24, v25, #14
+ drshrn_h v16, v4, v5, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v6, v30, v31, #14
+ drshrn_h v29, v29, v28, #14
+ drshrn_h v17, v26, v27, #14
+ drshrn_h v28, v22, v23, #14
+
+ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
+ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
+ neg v22.4s, v22.4s
+ neg v23.4s, v23.4s
+ drshrn_h v27, v20, v21, #14
+ drshrn_h v21, v22, v23, #14
+ drshrn_h v23, v18, v19, #14
+ drshrn_h v25, v30, v31, #14
+ mov v4.16b, v28.16b
+ mov v5.16b, v28.16b
+ dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
+ mov v20.16b, v28.16b
+ idct16_end
+endfunc
+
+function iadst16
+ ld1 {v0.8h,v1.8h}, [x11]
+
+ dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
+ dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8
+ dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
+ dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
+ dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
+
+ dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10
+ dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
+ dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4
+ dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
+
+ dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
+ dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
+ dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6
+ dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
+
+ dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
+ ld1 {v0.8h}, [x10]
+ dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
+ dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8
+ dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
+
+ dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13
+ dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
+ dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10
+ butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
+ dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
+
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
+ butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
+ dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
+ dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
+
+ butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
+ butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3
+
+ dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15
+
+ dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
+ dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
+ neg v29.8h, v29.8h // v29 = out[13]
+
+ dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a
+ dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a
+
+ butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
+ butterfly_8h v3, v7, v23, v21 // v3 = -out[1], v7 = t10
+
+ dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
+ neg v19.8h, v19.8h // v19 = out[3]
+ dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
+
+ butterfly_8h v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
+ butterfly_8h v4, v9, v24, v26 // v4 = out[14], v9 = t11
+
+ dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+ dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+ dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
+ dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
+
+ neg v31.8h, v5.8h // v31 = out[15]
+ neg v17.8h, v3.8h // v17 = out[1]
+
+ mov v16.16b, v2.16b
+ mov v30.16b, v4.16b
+ ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
+.macro load i, src, inc
+ ld1 {v\i\().8h}, [\src], \inc
+.endm
+.macro store i, dst, inc
+ st1 {v\i\().8h}, [\dst], \inc
+.endm
+.macro movi_v i, size, imm
+ movi v\i\()\size, \imm
+.endm
+.macro load_clear i, src, inc
+ ld1 {v\i\().8h}, [\src]
+ st1 {v2.8h}, [\src], \inc
+.endm
+
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
+ srshr \coef0, \coef0, #6
+ ld1 {v2.8b}, [x0], x1
+ srshr \coef1, \coef1, #6
+ ld1 {v3.8b}, [x3], x1
+ srshr \coef2, \coef2, #6
+ ld1 {v4.8b}, [x0], x1
+ srshr \coef3, \coef3, #6
+ uaddw \coef0, \coef0, v2.8b
+ ld1 {v5.8b}, [x3], x1
+ uaddw \coef1, \coef1, v3.8b
+ srshr \coef4, \coef4, #6
+ ld1 {v6.8b}, [x0], x1
+ srshr \coef5, \coef5, #6
+ ld1 {v7.8b}, [x3], x1
+ sqxtun v2.8b, \coef0
+ srshr \coef6, \coef6, #6
+ sqxtun v3.8b, \coef1
+ srshr \coef7, \coef7, #6
+ uaddw \coef2, \coef2, v4.8b
+ ld1 {\tmp1}, [x0], x1
+ uaddw \coef3, \coef3, v5.8b
+ ld1 {\tmp2}, [x3], x1
+ sqxtun v4.8b, \coef2
+ sub x0, x0, x1, lsl #2
+ sub x3, x3, x1, lsl #2
+ sqxtun v5.8b, \coef3
+ uaddw \coef4, \coef4, v6.8b
+ st1 {v2.8b}, [x0], x1
+ uaddw \coef5, \coef5, v7.8b
+ st1 {v3.8b}, [x3], x1
+ sqxtun v6.8b, \coef4
+ st1 {v4.8b}, [x0], x1
+ sqxtun v7.8b, \coef5
+ st1 {v5.8b}, [x3], x1
+ uaddw \coef6, \coef6, \tmp1
+ st1 {v6.8b}, [x0], x1
+ uaddw \coef7, \coef7, \tmp2
+ st1 {v7.8b}, [x3], x1
+ sqxtun \tmp1, \coef6
+ sqxtun \tmp2, \coef7
+ st1 {\tmp1}, [x0], x1
+ st1 {\tmp2}, [x3], x1
+.endm
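+
+// Roughly, in C terms, each row handled by load_add_store amounts to
+// (an illustrative sketch only; av_clip_uint8 as in libavutil):
+//
+//     for (int i = 0; i < 8; i++)
+//         dst[i] = av_clip_uint8(dst[i] + ((coef[i] + 32) >> 6));
+//
+// i.e. a rounding shift by 6 (srshr), a widening add of the destination
+// pixels (uaddw) and an unsigned saturating narrow back to 8 bit (sqxtun).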
+
+// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x8 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_8x16_pass1_neon
+ mov x14, x30
+
+ movi v2.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+
+ bl \txfm\()16
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+ cmp x1, #8
+ b.eq 1f
+.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+ store \i, x0, #16
+.endr
+ br x14
+1:
+ // Special case: For the last input column (x1 == 8),
+ // which would be stored as the last row in the temp buffer,
+ // don't store the first 8x8 block, but keep it in registers
+ // for the first slice of the second pass (where it is the
+ // last 8x8 block).
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ add x0, x0, #16
+ store \i, x0, #16
+.endr
+ mov v24.16b, v16.16b
+ mov v25.16b, v17.16b
+ mov v26.16b, v18.16b
+ mov v27.16b, v19.16b
+ mov v28.16b, v20.16b
+ mov v29.16b, v21.16b
+ mov v30.16b, v22.16b
+ mov v31.16b, v23.16b
+ br x14
+endfunc
+
+// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 8x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_8x16_pass2_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ cbz x3, 1f
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl \txfm\()16
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.eq idct16x16_dc_add_neon
+.endif
+ mov x15, x30
+ // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
+.ifnc \txfm1\()_\txfm2,idct_idct
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+.endif
+
+ sub sp, sp, #512
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ movrel x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+ movrel x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+ ld1 {v0.8h,v1.8h}, [x10]
+.endif
+ mov x9, #32
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #10
+ b.le idct16x16_quarter_add_neon
+ cmp w3, #38
+ b.le idct16x16_half_add_neon
+.endif
+
+.irp i, 0, 8
+ add x0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+ cmp w3, #38
+ b.le 1f
+.endif
+.endif
+ mov x1, #\i
+ add x2, x6, #(\i*2)
+ bl \txfm1\()16_1d_8x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+ ld1 {v0.8h,v1.8h}, [x10]
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ // Set v24-v31 to zero, for the in-register passthrough of
+ // coefficients to pass 2. Since we only do two slices, this can
+ // only ever happen for the second slice. So we only need to store
+ // zeros to the temp buffer for the second half of the buffer.
+ // Move x0 to the second half, and use x9 == 32 as increment.
+ add x0, x0, #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ movi_v \i, .16b, #0
+ st1 {v24.8h}, [x0], x9
+.endr
+3:
+.endif
+
+.irp i, 0, 8
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ mov x3, #\i
+ bl \txfm2\()16_1d_8x16_pass2_neon
+.endr
+
+ add sp, sp, #512
+.ifnc \txfm1\()_\txfm2,idct_idct
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+.endif
+ br x15
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
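+
+// A loose C outline of what each generated 16x16 function does (an
+// illustrative sketch only; the pass1/pass2 names stand for the 8x16
+// helpers defined above and tmp is the 512-byte stack buffer):
+//
+//     int16_t tmp[16 * 16];
+//     for (int i = 0; i < 16; i += 8)   // pass 1: vertical 8x16 slices
+//         txfm1_16_1d_8x16_pass1(tmp + 16 * i, i, src + i);
+//     for (int i = 0; i < 16; i += 8)   // pass 2: horizontal slices, add to dst
+//         txfm2_16_1d_8x16_pass2(dst + i, stride, tmp + i, i);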
+
+function idct16_1d_8x16_pass1_quarter_neon
+ mov x14, x30
+ movi v2.8h, #0
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_quarter
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+    // The first 8x8 block is kept in registers for the second pass;
+    // the rest is stored in the temp buffer.
+ // Since only a 4x4 part of the input was nonzero, this means that
+ // only 4 rows are nonzero after transposing, and the second pass
+ // only reads the topmost 4 rows. Therefore only store the topmost
+ // 4 rows.
+ add x0, x0, #16
+.irp i, 24, 25, 26, 27
+ store \i, x0, x9
+.endr
+ br x14
+endfunc
+
+function idct16_1d_8x16_pass2_quarter_neon
+ mov x14, x30
+ cbz x3, 1f
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_quarter
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14
+endfunc
+
+function idct16_1d_8x16_pass1_half_neon
+ mov x14, x30
+ movi v2.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_half
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+    // The first 8x8 block is kept in registers for the second pass;
+    // the rest is stored in the temp buffer.
+ add x0, x0, #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ store \i, x0, x9
+.endr
+ br x14
+endfunc
+
+function idct16_1d_8x16_pass2_half_neon
+ mov x14, x30
+ cbz x3, 1f
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_half
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_neon
+ add x0, sp, #(0*32)
+ add x2, x6, #(0*2)
+ bl idct16_1d_8x16_pass1_\size\()_neon
+.irp i, 0, 8
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ mov x3, #\i
+ bl idct16_1d_8x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, #512
+ br x15
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
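+
+// The quarter/half variants above correspond to the eob checks in
+// itxfm_func16x16 (eob <= 10 resp. eob <= 38): in those cases only the
+// top-left 4x4 resp. 8x8 corner of the 16x16 coefficient matrix can be
+// nonzero, so pass 1 only loads 4 resp. 8 rows and pass 2 only reads back
+// the corresponding rows of the temp buffer.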
+
+function idct32x32_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+
+ movi v1.4h, #0
+
+ ld1 {v2.h}[0], [x2]
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ smull v2.4s, v2.4h, v0.h[0]
+ rshrn v2.4h, v2.4s, #14
+ dup v2.8h, v2.h[0]
+ st1 {v1.h}[0], [x2]
+
+ srshr v0.8h, v2.8h, #6
+
+ mov x3, x0
+ mov x4, #32
+1:
+ // Loop to add the constant v0 into all 32x32 outputs
+ subs x4, x4, #2
+ ld1 {v1.16b,v2.16b}, [x0], x1
+ uaddw v16.8h, v0.8h, v1.8b
+ uaddw2 v17.8h, v0.8h, v1.16b
+ ld1 {v3.16b,v4.16b}, [x0], x1
+ uaddw v18.8h, v0.8h, v2.8b
+ uaddw2 v19.8h, v0.8h, v2.16b
+ uaddw v20.8h, v0.8h, v3.8b
+ uaddw2 v21.8h, v0.8h, v3.16b
+ uaddw v22.8h, v0.8h, v4.8b
+ uaddw2 v23.8h, v0.8h, v4.16b
+ sqxtun v1.8b, v16.8h
+ sqxtun2 v1.16b, v17.8h
+ sqxtun v2.8b, v18.8h
+ sqxtun2 v2.16b, v19.8h
+ sqxtun v3.8b, v20.8h
+ sqxtun2 v3.16b, v21.8h
+ st1 {v1.16b,v2.16b}, [x3], x1
+ sqxtun v4.8b, v22.8h
+ sqxtun2 v4.16b, v23.8h
+ st1 {v3.16b,v4.16b}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
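+
+// An approximate C model of the DC-only path above (illustrative sketch
+// only; cospi stands for idct_coeffs[0], av_clip_uint8 as in libavutil):
+//
+//     int dc = (src[0] * cospi + (1 << 13)) >> 14;
+//     dc     = (dc     * cospi + (1 << 13)) >> 14;
+//     src[0] = 0;
+//     for (int y = 0; y < 32; y++, dst += stride)
+//         for (int x = 0; x < 32; x++)
+//             dst[x] = av_clip_uint8(dst[x] + ((dc + 32) >> 6));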
+
+.macro idct32_end
+ butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
+ butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
+ butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
+ butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21
+ butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a
+ butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26
+ butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
+ butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29
+
+ dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
+ dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28
+ dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
+ dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+ butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
+ butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+ butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16
+ butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+ butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21
+ butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
+ butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26
+ butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20
+
+ dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
+ dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
+ dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
+ dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
+ ret
+.endm
+
+function idct32_odd
+ dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ dsmull_h v4, v5, v16, v8.h[0]
+ dsmull_h v28, v29, v19, v8.h[7]
+ dsmull_h v30, v31, v16, v8.h[1]
+ dsmull_h v22, v23, v17, v9.h[6]
+ dsmull_h v7, v6, v17, v9.h[7]
+ dsmull_h v26, v27, v19, v8.h[6]
+ dsmull_h v20, v21, v18, v9.h[0]
+ dsmull_h v24, v25, v18, v9.h[1]
+
+ neg v28.4s, v28.4s
+ neg v29.4s, v29.4s
+ neg v7.4s, v7.4s
+ neg v6.4s, v6.4s
+
+ drshrn_h v4, v4, v5, #14
+ drshrn_h v5, v28, v29, #14
+ drshrn_h v29, v30, v31, #14
+ drshrn_h v28, v22, v23, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v31, v26, v27, #14
+ drshrn_h v6, v20, v21, #14
+ drshrn_h v30, v24, v25, #14
+
+ dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
+ dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
+ drshrn_h v23, v16, v17, #14
+ drshrn_h v24, v18, v19, #14
+ neg v20.4s, v20.4s
+ neg v21.4s, v21.4s
+ drshrn_h v27, v27, v26, #14
+ drshrn_h v20, v20, v21, #14
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
+ drshrn_h v21, v16, v17, #14
+ drshrn_h v26, v18, v19, #14
+ dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
+ drshrn_h v25, v16, v17, #14
+ neg v18.4s, v18.4s
+ neg v19.4s, v19.4s
+ drshrn_h v22, v18, v19, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point transforms:
+// a normal IDCT16 of every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point transform
+// of the odd inputs, whose outputs are added to/subtracted from the
+// outputs of the first idct16.
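+// In other words (an illustrative sketch; e[] and o[] simply name the
+// outputs of the two half-transforms):
+//     e[0..15] = idct16(in[0], in[2], ..., in[30])
+//     o[0..15] = idct16_odd(in[1], in[3], ..., in[31])
+//     out[i]      = e[i] + o[i]      // i = 0..15
+//     out[31 - i] = e[i] - o[i]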
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
+function idct32_1d_8x32_pass1\suffix\()_neon
+ mov x14, x30
+ movi v2.8h, #0
+
+ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+.endif
+
+ bl idct16\suffix
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
+ // two transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the registers a, b horizontally, followed by the
+ // same registers b, a mirrored.
+.macro store_rev a, b
+ // There's no rev128 instruction, but we reverse each 64 bit
+ // half, and then flip them using an ext with 8 bytes offset.
+ rev64 v3.8h, \b
+ st1 {\a}, [x0], #16
+ rev64 v2.8h, \a
+ ext v3.16b, v3.16b, v3.16b, #8
+ st1 {\b}, [x0], #16
+ ext v2.16b, v2.16b, v2.16b, #8
+ st1 {v3.8h}, [x0], #16
+ st1 {v2.8h}, [x0], #16
+.endm
+ store_rev v16.8h, v24.8h
+ store_rev v17.8h, v25.8h
+ store_rev v18.8h, v26.8h
+ store_rev v19.8h, v27.8h
+ store_rev v20.8h, v28.8h
+ store_rev v21.8h, v29.8h
+ store_rev v22.8h, v30.8h
+ store_rev v23.8h, v31.8h
+ sub x0, x0, #512
+.purgem store_rev
+
+ // Move x2 back to the start of the input, and move
+ // to the first odd row
+.ifb \suffix
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+ sub x2, x2, x9, lsl #3
+.endif
+ add x2, x2, #64
+
+ movi v2.8h, #0
+ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
+ transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
+
+    // Store the registers a, b horizontally, adding them into the
+    // existing output first; then store the mirrored registers,
+    // subtracting them from the output.
+.macro store_rev a, b
+ ld1 {v4.8h}, [x0]
+ rev64 v3.8h, \b
+ add v4.8h, v4.8h, \a
+ rev64 v2.8h, \a
+ st1 {v4.8h}, [x0], #16
+ ext v3.16b, v3.16b, v3.16b, #8
+ ld1 {v5.8h}, [x0]
+ ext v2.16b, v2.16b, v2.16b, #8
+ add v5.8h, v5.8h, \b
+ st1 {v5.8h}, [x0], #16
+ ld1 {v6.8h}, [x0]
+ sub v6.8h, v6.8h, v3.8h
+ st1 {v6.8h}, [x0], #16
+ ld1 {v7.8h}, [x0]
+ sub v7.8h, v7.8h, v2.8h
+ st1 {v7.8h}, [x0], #16
+.endm
+
+ store_rev v31.8h, v23.8h
+ store_rev v30.8h, v22.8h
+ store_rev v29.8h, v21.8h
+ store_rev v28.8h, v20.8h
+ store_rev v27.8h, v19.8h
+ store_rev v26.8h, v18.8h
+ store_rev v25.8h, v17.8h
+ store_rev v24.8h, v16.8h
+.purgem store_rev
+ br x14
+endfunc
+
+// This is mostly the same as 8x32_pass1, but without the transpose;
+// it uses the source as a temp buffer between the two idct passes,
+// and adds into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_8x32_pass2\suffix\()_neon
+ mov x14, x30
+ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #3
+.endif
+
+ bl idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ store \i, x2, x9
+.endr
+
+ sub x2, x2, x9, lsl #4
+ add x2, x2, #64
+
+ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #3
+.endif
+ sub x2, x2, #64
+
+ bl idct32_odd\suffix
+
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+ ld1 {v4.8h}, [x2], x9
+ ld1 {v5.8h}, [x2], x9
+ add v4.8h, v4.8h, \a
+ ld1 {v6.8h}, [x2], x9
+ add v5.8h, v5.8h, \b
+ ld1 {v7.8h}, [x2], x9
+ add v6.8h, v6.8h, \c
+ add v7.8h, v7.8h, \d
+.else
+ ld1 {v4.8h}, [x2], x7
+ ld1 {v5.8h}, [x2], x7
+ sub v4.8h, v4.8h, \a
+ ld1 {v6.8h}, [x2], x7
+ sub v5.8h, v5.8h, \b
+ ld1 {v7.8h}, [x2], x7
+ sub v6.8h, v6.8h, \c
+ sub v7.8h, v7.8h, \d
+.endif
+ ld1 {v10.8b}, [x0], x1
+ ld1 {v11.8b}, [x0], x1
+ srshr v4.8h, v4.8h, #6
+ ld1 {v2.8b}, [x0], x1
+ srshr v5.8h, v5.8h, #6
+ uaddw v4.8h, v4.8h, v10.8b
+ ld1 {v3.8b}, [x0], x1
+ srshr v6.8h, v6.8h, #6
+ uaddw v5.8h, v5.8h, v11.8b
+ srshr v7.8h, v7.8h, #6
+ sub x0, x0, x1, lsl #2
+ uaddw v6.8h, v6.8h, v2.8b
+ sqxtun v4.8b, v4.8h
+ uaddw v7.8h, v7.8h, v3.8b
+ sqxtun v5.8b, v5.8h
+ st1 {v4.8b}, [x0], x1
+ sqxtun v6.8b, v6.8h
+ st1 {v5.8b}, [x0], x1
+ sqxtun v7.8b, v7.8h
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1
+.endm
+ load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
+ load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
+ load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
+ load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
+ sub x2, x2, x9
+ load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
+ load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
+ load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
+ load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
+.purgem load_acc_store
+ br x14
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 34, 135, 336
+endconst
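+
+// 34, 135 and 336 are the eob values up to which the 2nd, 3rd and 4th
+// 8-column slice of the 32x32 coefficient matrix is still known to be
+// all-zero; the function below compares the block's eob against them to
+// zero-fill the remaining pass-1 slices instead of transforming them.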
+
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+ cmp w3, #1
+ b.eq idct32x32_dc_add_neon
+
+ movrel x10, idct_coeffs
+
+ mov x15, x30
+
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #2048
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ // Double stride of the input, since we only read every other line
+ mov x9, #128
+ neg x7, x9
+
+ ld1 {v0.8h,v1.8h}, [x10], #32
+ ld1 {v8.8h,v9.8h}, [x10]
+
+ cmp w3, #34
+ b.le idct32x32_quarter_add_neon
+ cmp w3, #135
+ b.le idct32x32_half_add_neon
+
+ movrel x12, min_eob_idct_idct_32, 2
+
+.irp i, 0, 8, 16, 24
+ add x0, sp, #(\i*64)
+.if \i > 0
+ ldrh w1, [x12], #2
+ cmp w3, w1
+ mov x1, #(32 - \i)/4
+ b.le 1f
+.endif
+ add x2, x6, #(\i*2)
+ bl idct32_1d_8x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ // Write zeros to the temp buffer for pass 2
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+2:
+ subs x1, x1, #1
+.rept 4
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
+.endr
+ b.ne 2b
+3:
+.irp i, 0, 8, 16, 24
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ bl idct32_1d_8x32_pass2_neon
+.endr
+
+ add sp, sp, #2048
+
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+
+ br x15
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
+ add x0, sp, #(0*64)
+ add x2, x6, #(0*2)
+ bl idct32_1d_8x32_pass1_\size\()_neon
+.ifc \size,half
+ add x0, sp, #(8*64)
+ add x2, x6, #(8*2)
+ bl idct32_1d_8x32_pass1_\size\()_neon
+.endif
+.irp i, 0, 8, 16, 24
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ bl idct32_1d_8x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, #2048
+
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+
+ br x15
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000000..9075f3d406
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s
+.endm
+
+// The input to and output from this macro are in the registers v16-v31,
+// and v0-v7 are used as scratch registers.
+// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
+// Depending on the width of the loop filter, we either use v16-v19
+// and v28-v31 as temp registers, or v8-v15.
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+ dup v0.8h, w2 // E
+ dup v2.8h, w3 // I
+ dup v3.8h, w4 // H
+
+ uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2)
+ uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1)
+ uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1)
+ uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
+ uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
+ umax v4.8h, v4.8h, v5.8h
+ umax v5.8h, v6.8h, v7.8h
+ umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
+ uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0)
+ umax v4.8h, v4.8h, v5.8h
+ add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
+ uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1)
+ umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
+ ushr v5.8h, v5.8h, #1
+ cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I
+ add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ cmhs v6.8h, v0.8h, v6.8h
+ and v4.16b, v4.16b, v6.16b // fm
+
+ // If no pixels need filtering, just exit as soon as possible
+ mov x11, v4.d[0]
+ mov x12, v4.d[1]
+ adds x11, x11, x12
+ b.ne 1f
+ br x10
+1:
+
+.if \wd >= 8
+ dup v0.8h, w5
+
+ uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
+ uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
+ uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
+ uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
+ umax v6.8h, v6.8h, v2.8h
+ umax v1.8h, v1.8h, \tmp1\().8h
+ umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
+.if \wd == 16
+ uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0)
+ umax v6.8h, v6.8h, v1.8h
+ uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0)
+ umax v6.8h, v6.8h, \tmp2\().8h
+ uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0)
+ cmhs v6.8h, v0.8h, v6.8h // flat8in
+ uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0)
+ and v6.16b, v6.16b, v4.16b // flat8in && fm
+ uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0)
+ bic v4.16b, v4.16b, v6.16b // fm && !flat8in
+ uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0)
+ uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0)
+ uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0)
+
+ umax v7.8h, v7.8h, v2.8h
+ umax v1.8h, v1.8h, v8.8h
+ umax v9.8h, v9.8h, v10.8h
+ umax v11.8h, v11.8h, v12.8h
+ // The rest of the calculation of flat8out is interleaved below
+.else
+ // The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ // Calculate the normal inner loop filter for 2 or 4 pixels
+ uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0)
+.if \wd == 16
+ umax v7.8h, v7.8h, v1.8h
+ umax v9.8h, v9.8h, v11.8h
+.elseif \wd == 8
+ umax v6.8h, v6.8h, v1.8h
+.endif
+ uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
+.if \wd == 16
+ umax v7.8h, v7.8h, v9.8h
+.elseif \wd == 8
+ umax v6.8h, v6.8h, \tmp2\().8h
+.endif
+ dup \tmp2\().8h, w6 // left shift for saturation
+ sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1
+ neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
+ umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
+ sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0
+ movi \tmp5\().8h, #3
+.if \wd == 8
+ cmhs v6.8h, v0.8h, v6.8h // flat8in
+.endif
+ cmhs v5.8h, v3.8h, v5.8h // !hev
+.if \wd == 8
+ and v6.16b, v6.16b, v4.16b // flat8in && fm
+.endif
+ sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
+.if \wd == 16
+ cmhs v7.8h, v0.8h, v7.8h // flat8out
+.elseif \wd == 8
+ bic v4.16b, v4.16b, v6.16b // fm && !flat8in
+.endif
+ and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
+.if \wd == 16
+ and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
+.endif
+ sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+ mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
+ bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
+ movi v2.8h, #4
+ add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ movi v3.8h, #3
+ sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
+ movi \tmp5\().8h, #0
+ sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+ dup \tmp6\().8h, w7 // max pixel value
+.if \wd == 16
+ bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
+.endif
+
+ ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1
+
+ add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
+ add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
+ smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+ smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+ sshr \tmp3\().8h, \tmp3\().8h, #3 // f1
+ sshr \tmp4\().8h, \tmp4\().8h, #3 // f2
+
+ add v0.8h, v23.8h, \tmp4\().8h // p0 + f2
+ sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1
+ smin v0.8h, v0.8h, \tmp6\().8h
+ smin v2.8h, v2.8h, \tmp6\().8h
+ srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
+ smax v0.8h, v0.8h, \tmp5\().8h // out p0
+ smax v2.8h, v2.8h, \tmp5\().8h // out q0
+ bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
+ bit v24.16b, v2.16b, v4.16b
+
+ add v0.8h, v22.8h, \tmp3\().8h // p1 + f
+ sub v2.8h, v25.8h, \tmp3\().8h // q1 - f
+.if \wd >= 8
+ mov x11, v6.d[0]
+.endif
+ smin v0.8h, v0.8h, \tmp6\().8h
+ smin v2.8h, v2.8h, \tmp6\().8h
+.if \wd >= 8
+ mov x12, v6.d[1]
+.endif
+ smax v0.8h, v0.8h, \tmp5\().8h // out p1
+ smax v2.8h, v2.8h, \tmp5\().8h // out q1
+.if \wd >= 8
+ adds x11, x11, x12
+.endif
+ bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
+ bit v25.16b, v2.16b, v5.16b
+
+ // If no pixels need flat8in, jump to flat8out
+ // (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd >= 8
+.if \wd == 16
+ b.eq 6f
+.else
+ b.ne 1f
+ br x13
+1:
+.endif
+
+ // flat8in
+ add \tmp1\().8h, v20.8h, v21.8h
+ add \tmp3\().8h, v22.8h, v25.8h
+ add \tmp5\().8h, v20.8h, v22.8h
+ add \tmp7\().8h, v23.8h, v26.8h
+ add v0.8h, \tmp1\().8h, \tmp1\().8h
+ add v0.8h, v0.8h, v23.8h
+ add v0.8h, v0.8h, v24.8h
+ add v0.8h, v0.8h, \tmp5\().8h
+ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+ sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+ urshr v2.8h, v0.8h, #3 // out p2
+
+ add v0.8h, v0.8h, \tmp3\().8h
+ add \tmp1\().8h, v20.8h, v23.8h
+ add \tmp3\().8h, v24.8h, v27.8h
+ urshr v3.8h, v0.8h, #3 // out p1
+
+ add v0.8h, v0.8h, \tmp7\().8h
+ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+ add \tmp5\().8h, v21.8h, v24.8h
+ add \tmp7\().8h, v25.8h, v27.8h
+ urshr v4.8h, v0.8h, #3 // out p0
+
+ add v0.8h, v0.8h, \tmp3\().8h
+ sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+ add \tmp1\().8h, v22.8h, v25.8h
+ add \tmp3\().8h, v26.8h, v27.8h
+ urshr v5.8h, v0.8h, #3 // out q0
+
+ add v0.8h, v0.8h, \tmp7\().8h
+ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+ urshr \tmp5\().8h, v0.8h, #3 // out q1
+
+ add v0.8h, v0.8h, \tmp3\().8h
+ // The output here is written back into the input registers. This doesn't
+    // matter for the flat8out part below, since we only update those pixels
+ // which won't be touched below.
+ bit v21.16b, v2.16b, v6.16b
+ bit v22.16b, v3.16b, v6.16b
+ bit v23.16b, v4.16b, v6.16b
+ urshr \tmp6\().8h, v0.8h, #3 // out q2
+ bit v24.16b, v5.16b, v6.16b
+ bit v25.16b, \tmp5\().16b, v6.16b
+ bit v26.16b, \tmp6\().16b, v6.16b
+.endif
+.if \wd == 16
+6:
+ orr v2.16b, v6.16b, v7.16b
+ mov x11, v2.d[0]
+ mov x12, v2.d[1]
+ adds x11, x11, x12
+ b.ne 1f
+ // If no pixels needed flat8in nor flat8out, jump to a
+ // writeout of the inner 4 pixels
+ br x14
+1:
+
+ mov x11, v7.d[0]
+ mov x12, v7.d[1]
+ adds x11, x11, x12
+ b.ne 1f
+ // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ br x15
+
+1:
+ // flat8out
+    // This writes all outputs into v2-v17 (skipping v7 and v16).
+ // If this part is skipped, the output is read from v21-v26 (which is the input
+ // to this section).
+ shl v0.8h, v16.8h, #3 // 8 * v16
+ sub v0.8h, v0.8h, v16.8h // 7 * v16
+ add v0.8h, v0.8h, v17.8h
+ add v8.8h, v17.8h, v18.8h
+ add v10.8h, v19.8h, v20.8h
+ add v0.8h, v0.8h, v8.8h
+ add v8.8h, v16.8h, v17.8h
+ add v12.8h, v21.8h, v22.8h
+ add v0.8h, v0.8h, v10.8h
+ add v10.8h, v18.8h, v25.8h
+ add v14.8h, v23.8h, v24.8h
+ sub v10.8h, v10.8h, v8.8h
+ add v0.8h, v0.8h, v12.8h
+ add v0.8h, v0.8h, v14.8h
+ add v12.8h, v16.8h, v18.8h
+ add v14.8h, v19.8h, v26.8h
+ urshr v2.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ add v8.8h, v16.8h, v19.8h
+ add v10.8h, v20.8h, v27.8h
+ sub v14.8h, v14.8h, v12.8h
+ bif v2.16b, v17.16b, v7.16b
+ urshr v3.8h , v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v12.8h, v16.8h, v20.8h
+ add v14.8h, v21.8h, v28.8h
+ sub v10.8h, v10.8h, v8.8h
+ bif v3.16b, v18.16b, v7.16b
+ urshr v4.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ add v8.8h, v16.8h, v21.8h
+ add v10.8h, v22.8h, v29.8h
+ sub v14.8h, v14.8h, v12.8h
+ bif v4.16b, v19.16b, v7.16b
+ urshr v5.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v12.8h, v16.8h, v22.8h
+ add v14.8h, v23.8h, v30.8h
+ sub v10.8h, v10.8h, v8.8h
+ bif v5.16b, v20.16b, v7.16b
+ urshr v6.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ add v10.8h, v16.8h, v23.8h
+ sub v14.8h, v14.8h, v12.8h
+ add v12.8h, v24.8h, v31.8h
+ bif v6.16b, v21.16b, v7.16b
+ urshr v8.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ sub v10.8h, v12.8h, v10.8h
+ add v12.8h, v17.8h, v24.8h
+ add v14.8h, v25.8h, v31.8h
+ bif v8.16b, v22.16b, v7.16b
+ urshr v9.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ sub v14.8h, v14.8h, v12.8h
+ add v12.8h, v26.8h, v31.8h
+ bif v9.16b, v23.16b, v7.16b
+ urshr v10.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v14.8h, v18.8h, v25.8h
+ add v18.8h, v19.8h, v26.8h
+ sub v12.8h, v12.8h, v14.8h
+ add v14.8h, v27.8h, v31.8h
+ bif v10.16b, v24.16b, v7.16b
+ urshr v11.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v12.8h
+ add v12.8h, v20.8h, v27.8h
+ sub v14.8h, v14.8h, v18.8h
+ add v18.8h, v28.8h, v31.8h
+ bif v11.16b, v25.16b, v7.16b
+ sub v18.8h, v18.8h, v12.8h
+ urshr v12.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v14.8h, v21.8h, v28.8h
+ add v20.8h, v29.8h, v31.8h
+ bif v12.16b, v26.16b, v7.16b
+ urshr v13.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v18.8h
+ sub v20.8h, v20.8h, v14.8h
+ add v18.8h, v22.8h, v29.8h
+ add v22.8h, v30.8h, v31.8h
+ bif v13.16b, v27.16b, v7.16b
+ urshr v14.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v20.8h
+ sub v22.8h, v22.8h, v18.8h
+ bif v14.16b, v28.16b, v7.16b
+ urshr v15.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v22.8h
+ bif v15.16b, v29.16b, v7.16b
+ urshr v17.8h, v0.8h, #4
+ bif v17.16b, v30.16b, v7.16b
+.endif
+.endm
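+
+// A condensed C-style summary of the masks computed above (illustrative
+// only; E, I and H are the thresholds in w2-w4, already scaled for the
+// bit depth, and F = 1 << (BIT_DEPTH - 8) from w5):
+//
+//     fm  = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|) <= I &&
+//           |p0-q0| * 2 + (|p1-q1| >> 1) <= E;
+//     hev = max(|p1-p0|, |q1-q0|) > H;
+//     flat8in  (wd >= 8)  = max(|p3-p0|, |p2-p0|, |p1-p0|,
+//                               |q1-q0|, |q2-q0|, |q3-q0|) <= F;
+//     flat8out (wd == 16) = max(|p7-p0|, ..., |q7-q0|) <= F;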
+
+// For wd <= 8, we use v16-v19 and v28-v31 as temp registers; for wd=16
+// those are needed as inputs/outputs, so v8-v15 are used as temp
+// registers there instead.
+function vp9_loop_filter_4
+ loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+endfunc
+
+function vp9_loop_filter_8
+ loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+endfunc
+
+function vp9_loop_filter_16
+ loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
+ ret
+endfunc
+
+.macro loop_filter_4
+ bl vp9_loop_filter_4
+.endm
+
+.macro loop_filter_8
+ // calculate alternative 'return' targets
+ adr x13, 6f
+ bl vp9_loop_filter_8
+.endm
+
+.macro loop_filter_16
+ // calculate alternative 'return' targets
+ adr x14, 7f
+ adr x15, 8f
+ bl vp9_loop_filter_16
+.endm
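+
+// The adr instructions above store the addresses of the local labels 6:,
+// 7: and 8: in the calling function into x13-x15; the shared filter core
+// branches to them ("br x13"/"br x14"/"br x15") as early returns into the
+// matching partial writeback path when the flat8in and/or flat8out parts
+// turn out not to be needed.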
+
+
+// The public functions in this file have the following signature:
+// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
+
+.macro bpp_frontend func, bpp, push
+function ff_\func\()_\bpp\()_neon, export=1
+.if \push
+ mov x16, x30
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+.endif
+ lsl w2, w2, #\bpp - 8
+ lsl w3, w3, #\bpp - 8
+ lsl w4, w4, #\bpp - 8
+ mov x5, #1 << (\bpp - 8)
+ mov x6, #16 - \bpp
+ mov x7, #((1 << \bpp) - 1)
+.if \push
+ bl \func\()_16_neon
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x16
+.else
+ b \func\()_16_neon
+.endif
+endfunc
+.endm
+
+.macro bpp_frontends func, push=0
+ bpp_frontend \func, 10, \push
+ bpp_frontend \func, 12, \push
+.endm
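+
+// As a concrete example, the generated 10 bpp frontend rescales its
+// arguments like this before calling the shared 16 bpp core:
+//     E <<= 2;  I <<= 2;  H <<= 2;   // w2, w3, w4
+//     x5 = 4;                        // flat threshold, 1 << (10 - 8)
+//     x6 = 6;                        // 16 - bpp, shift used to saturate to the bit depth
+//     x7 = 1023;                     // max pixel value, (1 << 10) - 1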
+
+.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+ mov x16, x30
+.if \push
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+.endif
+ lsl w2, w2, #\bpp - 8
+ lsl w3, w3, #\bpp - 8
+ lsl w4, w4, #\bpp - 8
+ mov x5, #1 << (\bpp - 8)
+ mov x6, #16 - \bpp
+ mov x7, #((1 << \bpp) - 1)
+ bl \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+ add x0, x0, x1, lsl #3
+.else
+ add x0, x0, #16
+.endif
+ bl \func\()_\int_suffix\()_16_neon
+.if \push
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+.endif
+ br x16
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
+ bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
+ bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
+.endm
+
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+ mov x16, x30
+ lsr w8, w2, #8
+ lsr w14, w3, #8
+ lsr w15, w4, #8
+ and w2, w2, #0xff
+ and w3, w3, #0xff
+ and w4, w4, #0xff
+ lsl w2, w2, #\bpp - 8
+ lsl w3, w3, #\bpp - 8
+ lsl w4, w4, #\bpp - 8
+ mov x5, #1 << (\bpp - 8)
+ mov x6, #16 - \bpp
+ mov x7, #((1 << \bpp) - 1)
+ bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+ add x0, x0, x1, lsl #3
+.else
+ add x0, x0, #16
+.endif
+ lsl w2, w8, #\bpp - 8
+ lsl w3, w14, #\bpp - 8
+ lsl w4, w15, #\bpp - 8
+ bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+ br x16
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+ bpp_frontend_mix2 \wd1, \wd2, v, 10
+ bpp_frontend_mix2 \wd1, \wd2, v, 12
+ bpp_frontend_mix2 \wd1, \wd2, h, 10
+ bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x9], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x9], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x9], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+ sub x9, x9, x1, lsl #1
+
+ loop_filter_4
+
+ st1 {v22.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
+function vp9_loop_filter_h_4_8_16_neon
+ mov x10, x30
+ sub x9, x0, #8
+ add x0, x9, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x9], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x9], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x9], x1
+ ld1 {v27.8h}, [x0], x1
+
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_4
+
+ // Move x9 forward by 2 pixels; we don't need to rewrite the
+ // outermost 2 pixels since they aren't changed.
+ add x9, x9, #4
+ add x0, x9, x1, lsl #2
+
+    // We will only write the mid 4 pixels back; after the loop filter,
+ // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
+ // We need to transpose them to columns, done with a 4x8 transpose
+ // (which in practice is two 4x4 transposes of the two 4x4 halves
+ // of the 8x4 pixels; into 4x8 pixels).
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.d}[0], [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #4
+
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+function vp9_loop_filter_v_8_8_16_neon
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x9], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x9], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x9], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #2
+ add x9, x9, x1
+
+ loop_filter_8
+
+ st1 {v21.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+
+ br x10
+6:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+function vp9_loop_filter_h_8_8_16_neon
+ mov x10, x30
+ sub x9, x0, #8
+ add x0, x9, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x9], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x9], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x9], x1
+ ld1 {v27.8h}, [x0], x1
+
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_8
+
+ add x0, x9, x1, lsl #2
+
+ // Even though only 6 pixels per row have been changed, we write the
+ // full 8 pixel registers.
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v27.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+
+ br x10
+6:
+ // If we didn't need to do the flat8in part, we use the same writeback
+ // as in loop_filter_h_4_8.
+ add x9, x9, #4
+ add x0, x9, x1, lsl #2
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.d}[0], [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #4
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_8_16_neon
+ mov x10, x30
+ sub x9, x0, x1, lsl #3
+ ld1 {v16.8h}, [x9], x1 // p7
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v17.8h}, [x9], x1 // p6
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v18.8h}, [x9], x1 // p5
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v19.8h}, [x9], x1 // p4
+ ld1 {v27.8h}, [x0], x1 // q3
+ ld1 {v20.8h}, [x9], x1 // p3
+ ld1 {v28.8h}, [x0], x1 // q4
+ ld1 {v21.8h}, [x9], x1 // p2
+ ld1 {v29.8h}, [x0], x1 // q5
+ ld1 {v22.8h}, [x9], x1 // p1
+ ld1 {v30.8h}, [x0], x1 // q6
+ ld1 {v23.8h}, [x9], x1 // p0
+ ld1 {v31.8h}, [x0], x1 // q7
+ sub x9, x9, x1, lsl #3
+ sub x0, x0, x1, lsl #3
+ add x9, x9, x1
+
+ loop_filter_16
+
+ // If we did the flat8out part, we get the output in
+ // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
+ // store v2-v9 there, and v10-v17 into x0.
+ st1 {v2.8h}, [x9], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v3.8h}, [x9], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v4.8h}, [x9], x1
+ st1 {v12.8h}, [x0], x1
+ st1 {v5.8h}, [x9], x1
+ st1 {v13.8h}, [x0], x1
+ st1 {v6.8h}, [x9], x1
+ st1 {v14.8h}, [x0], x1
+ st1 {v8.8h}, [x9], x1
+ st1 {v15.8h}, [x0], x1
+ st1 {v9.8h}, [x9], x1
+ st1 {v17.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1
+
+ br x10
+8:
+ add x9, x9, x1, lsl #2
+ // If we didn't do the flat8out part, the output is left in the
+ // input registers.
+ st1 {v21.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ br x10
+7:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
+
+function vp9_loop_filter_h_16_8_16_neon
+ mov x10, x30
+ sub x9, x0, #16
+ ld1 {v16.8h}, [x9], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v17.8h}, [x9], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v18.8h}, [x9], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v19.8h}, [x9], x1
+ ld1 {v27.8h}, [x0], x1
+ ld1 {v20.8h}, [x9], x1
+ ld1 {v28.8h}, [x0], x1
+ ld1 {v21.8h}, [x9], x1
+ ld1 {v29.8h}, [x0], x1
+ ld1 {v22.8h}, [x9], x1
+ ld1 {v30.8h}, [x0], x1
+ ld1 {v23.8h}, [x9], x1
+ ld1 {v31.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ sub x9, x9, x1, lsl #3
+
+    // The 16x8 pixels read above are in two 8x8 blocks; the left
+ // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
+ // of this, to get one column per register.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ loop_filter_16
+
+ transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
+ transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
+
+ st1 {v16.8h}, [x9], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v2.8h}, [x9], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v3.8h}, [x9], x1
+ st1 {v12.8h}, [x0], x1
+ st1 {v4.8h}, [x9], x1
+ st1 {v13.8h}, [x0], x1
+ st1 {v5.8h}, [x9], x1
+ st1 {v14.8h}, [x0], x1
+ st1 {v6.8h}, [x9], x1
+ st1 {v15.8h}, [x0], x1
+ st1 {v8.8h}, [x9], x1
+ st1 {v17.8h}, [x0], x1
+ st1 {v9.8h}, [x9], x1
+ st1 {v31.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+
+ br x10
+8:
+ // The same writeback as in loop_filter_h_8_8
+ sub x9, x0, #8
+ add x0, x9, x1, lsl #2
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v27.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ br x10
+7:
+ // The same writeback as in loop_filter_h_4_8
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #2
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.d}[0], [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #4
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
new file mode 100644
index 0000000000..0878763020
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
@@ -0,0 +1,1334 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+
+// The main loop filter macro is templated and can produce filters for
+// vectors of 8 or 16 bytes. The register mapping throughout the filter
+// is close to identical to the arm version (please try to maintain this,
+// if either is changed!). When the arm version uses e.g. d20 for the
+// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
+// on vector length.
+//
+// The number of elements in the vector is passed in via the macro parameter
+// \sz, which is either .8b or .16b. For simple instructions that don't
+// lengthen or narrow things, this can easily be templated like this:
+// uabd v4\sz, v20\sz, v21\sz
+//
+// For instructions that lengthen or narrow content, the arm version would
+// have used q registers. For these instructions, we have macros that expand
+// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
+// pair, depending on the \sz parameter. Wherever the arm version would have
+// used a q register, these macros instead take two v registers, i.e. q3
+// is mapped to v6+v7. For the case with 8 byte input vectors, such a
+// lengthening operation is only stored in v6.8h (what was in q3 in the arm
+// case), while the 16 byte input vectors will use v6.8h + v7.8h.
+// Such a macro invocation would look like this:
+// uaddl_sz v8.8h, v9.8h, v17, v18, \sz
+//
+// That is, in the 8 byte input vector case, the second register in these
+// register pairs will be unused.
+// Unfortunately, this makes the code quite hard to read. For readability,
+// see the arm version instead.
+
+
+.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
+ add \dst1, \in1, \in3
+.ifc \sz, .16b
+ add \dst2, \in2, \in4
+.endif
+.endm
+
+.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
+ sub \dst1, \in1, \in3
+.ifc \sz, .16b
+ sub \dst2, \in2, \in4
+.endif
+.endm
+
+.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
+ uaddw \dst1, \in1, \in3\().8b
+.ifc \sz, .16b
+ uaddw2 \dst2, \in2, \in3\().16b
+.endif
+.endm
+
+.macro usubw_sz dst1, dst2, in1, in2, in3, sz
+ usubw \dst1, \in1, \in3\().8b
+.ifc \sz, .16b
+ usubw2 \dst2, \in2, \in3\().16b
+.endif
+.endm
+
+.macro usubl_sz dst1, dst2, in1, in2, sz
+ usubl \dst1, \in1\().8b, \in2\().8b
+.ifc \sz, .16b
+ usubl2 \dst2, \in1\().16b, \in2\().16b
+.endif
+.endm
+
+.macro sqxtn_sz dst, in1, in2, sz
+ sqxtn \dst\().8b, \in1
+.ifc \sz, .16b
+ sqxtn2 \dst\().16b, \in2
+.endif
+.endm
+
+.macro sqxtun_sz dst, in1, in2, sz
+ sqxtun \dst\().8b, \in1
+.ifc \sz, .16b
+ sqxtun2 \dst\().16b, \in2
+.endif
+.endm
+
+.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
+ mul \dst1, \in1, \in3
+.ifc \sz, .16b
+ mul \dst2, \in2, \in4
+.endif
+.endm
+
+.macro saddw_sz dst1, dst2, in1, in2, in3, sz
+ saddw \dst1, \in1, \in3\().8b
+.ifc \sz, .16b
+ saddw2 \dst2, \in2, \in3\().16b
+.endif
+.endm
+
+.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
+ ssubw \dst1, \in1, \in3\().8b
+.ifc \sz, .16b
+ ssubw2 \dst2, \in2, \in3\().16b
+.endif
+.endm
+
+.macro uxtl_sz dst1, dst2, in, sz
+ uxtl \dst1, \in\().8b
+.ifc \sz, .16b
+ uxtl2 \dst2, \in\().16b
+.endif
+.endm
+
+.macro uaddl_sz dst1, dst2, in1, in2, sz
+ uaddl \dst1, \in1\().8b, \in2\().8b
+.ifc \sz, .16b
+ uaddl2 \dst2, \in1\().16b, \in2\().16b
+.endif
+.endm
+
+.macro rshrn_sz dst, in1, in2, shift, sz
+ rshrn \dst\().8b, \in1, \shift
+.ifc \sz, .16b
+ rshrn2 \dst\().16b, \in2, \shift
+.endif
+.endm
+
+.macro ushll_sz dst1, dst2, in, shift, sz
+ ushll \dst1, \in\().8b, \shift
+.ifc \sz, .16b
+ ushll2 \dst2, \in\().16b, \shift
+.endif
+.endm
+
+// The input to and output from this macro are in the registers v16-v31,
+// and v0-v7 are used as scratch registers.
+// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
+// Depending on the width of the loop filter, we either use v16-v19
+// and v28-v31 as temp registers, or v8-v15.
+// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
+// tmpq2 == tmp3 + tmp4, etc.
+.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+.if \mix == 0
+ dup v0\sz, w2 // E
+ dup v2\sz, w3 // I
+ dup v3\sz, w4 // H
+.else
+ dup v0.8h, w2 // E
+ dup v2.8h, w3 // I
+ dup v3.8h, w4 // H
+ rev16 v1.16b, v0.16b // E
+ rev16 v4.16b, v2.16b // I
+ rev16 v5.16b, v3.16b // H
+ uzp1 v0.16b, v0.16b, v1.16b
+ uzp1 v2.16b, v2.16b, v4.16b
+ uzp1 v3.16b, v3.16b, v5.16b
+.endif
+
+ uabd v4\sz, v20\sz, v21\sz // abs(p3 - p2)
+ uabd v5\sz, v21\sz, v22\sz // abs(p2 - p1)
+ uabd v6\sz, v22\sz, v23\sz // abs(p1 - p0)
+ uabd v7\sz, v24\sz, v25\sz // abs(q0 - q1)
+ uabd \tmp1\sz, v25\sz, v26\sz // abs(q1 - q2)
+ uabd \tmp2\sz, v26\sz, v27\sz // abs(q2 - q3)
+ umax v4\sz, v4\sz, v5\sz
+ umax v5\sz, v6\sz, v7\sz
+ umax \tmp1\sz, \tmp1\sz, \tmp2\sz
+ uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0)
+ umax v4\sz, v4\sz, v5\sz
+ uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2
+ uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1)
+ umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3))
+ ushr v5\sz, v5\sz, #1
+ cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I
+ uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ cmhs v5\sz, v0\sz, v6\sz
+ and v4\sz, v4\sz, v5\sz // fm
+
+ // If no pixels need filtering, just exit as soon as possible
+ mov x5, v4.d[0]
+.ifc \sz, .16b
+ mov x6, v4.d[1]
+ adds x5, x5, x6
+ b.eq 9f
+.else
+ cbz x5, 9f
+.endif
+
+.if \wd >= 8
+ movi v0\sz, #1
+
+ uabd v6\sz, v20\sz, v23\sz // abs(p3 - p0)
+ uabd v2\sz, v21\sz, v23\sz // abs(p2 - p0)
+ uabd v1\sz, v22\sz, v23\sz // abs(p1 - p0)
+ uabd \tmp1\sz, v25\sz, v24\sz // abs(q1 - q0)
+ uabd \tmp2\sz, v26\sz, v24\sz // abs(q2 - q0)
+ uabd \tmp3\sz, v27\sz, v24\sz // abs(q3 - q0)
+ umax v6\sz, v6\sz, v2\sz
+ umax v1\sz, v1\sz, \tmp1\sz
+ umax \tmp2\sz, \tmp2\sz, \tmp3\sz
+.if \wd == 16
+ uabd v7\sz, v16\sz, v23\sz // abs(p7 - p0)
+ umax v6\sz, v6\sz, v1\sz
+ uabd v2\sz, v17\sz, v23\sz // abs(p6 - p0)
+ umax v6\sz, v6\sz, \tmp2\sz
+ uabd v1\sz, v18\sz, v23\sz // abs(p5 - p0)
+ cmhs v6\sz, v0\sz, v6\sz // flat8in
+ uabd v8\sz, v19\sz, v23\sz // abs(p4 - p0)
+ and v6\sz, v6\sz, v4\sz // flat8in && fm
+ uabd v9\sz, v28\sz, v24\sz // abs(q4 - q0)
+ bic v4\sz, v4\sz, v6\sz // fm && !flat8in
+ uabd v10\sz, v29\sz, v24\sz // abs(q5 - q0)
+ uabd v11\sz, v30\sz, v24\sz // abs(q6 - q0)
+ uabd v12\sz, v31\sz, v24\sz // abs(q7 - q0)
+
+ umax v7\sz, v7\sz, v2\sz
+ umax v1\sz, v1\sz, v8\sz
+ umax v9\sz, v9\sz, v10\sz
+ umax v11\sz, v11\sz, v12\sz
+ // The rest of the calculation of flat8out is interleaved below
+.else
+ // The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ // Calculate the normal inner loop filter for 2 or 4 pixels
+ uabd v5\sz, v22\sz, v23\sz // abs(p1 - p0)
+.if \wd == 16
+ umax v7\sz, v7\sz, v1\sz
+ umax v9\sz, v9\sz, v11\sz
+.elseif \wd == 8
+ umax v6\sz, v6\sz, v1\sz
+.endif
+ uabd v1\sz, v25\sz, v24\sz // abs(q1 - q0)
+.if \wd == 16
+ umax v7\sz, v7\sz, v9\sz
+.elseif \wd == 8
+ umax v6\sz, v6\sz, \tmp2\sz
+.endif
+ usubl_sz \tmp1\().8h, \tmp2\().8h, v22, v25, \sz // p1 - q1
+ umax v5\sz, v5\sz, v1\sz // max(abs(p1 - p0), abs(q1 - q0))
+.if \mix != 0
+ mov v1.d[0], x11
+.endif
+ usubl_sz \tmp3\().8h, \tmp4\().8h, v24, v23, \sz // q0 - p0
+ movi \tmp5\().8h, #3
+.if \wd == 8
+ cmhs v6\sz, v0\sz, v6\sz // flat8in
+.endif
+.if \mix != 0
+ sxtl v1.8h, v1.8b
+.endif
+ cmhs v5\sz, v3\sz, v5\sz // !hev
+.if \wd == 8
+ // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
+.if \mix != 0
+ and v6\sz, v6\sz, v1.16b
+.endif
+ and v6\sz, v6\sz, v4\sz // flat8in && fm
+.endif
+ sqxtn_sz \tmp1, \tmp1\().8h, \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
+.if \wd == 16
+ cmhs v7\sz, v0\sz, v7\sz // flat8out
+.elseif \wd == 8
+ bic v4\sz, v4\sz, v6\sz // fm && !flat8in
+.endif
+ and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in
+.if \wd == 16
+ and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm
+.endif
+
+ mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0)
+ bic \tmp1\sz, \tmp1\sz, v5\sz // if (!hev) av_clip_int8 = 0
+ movi v2\sz, #4
+ saddw_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ movi v3\sz, #3
+ sqxtn_sz \tmp1, \tmp3\().8h, \tmp4\().8h, \sz // f
+.if \wd == 16
+ bic v6\sz, v6\sz, v7\sz // fm && flat8in && !flat8out
+.endif
+
+ sqadd \tmp3\sz, \tmp1\sz, v2\sz // FFMIN(f + 4, 127)
+ sqadd \tmp4\sz, \tmp1\sz, v3\sz // FFMIN(f + 3, 127)
+ uxtl_sz v0.8h, v1.8h, v23, \sz // p0
+ sshr \tmp3\sz, \tmp3\sz, #3 // f1
+ sshr \tmp4\sz, \tmp4\sz, #3 // f2
+
+ uxtl_sz v2.8h, v3.8h, v24, \sz // q0
+ saddw_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp4, \sz // p0 + f2
+ ssubw_sz v2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q0 - f1
+ sqxtun_sz v0, v0.8h, v1.8h, \sz // out p0
+ sqxtun_sz v1, v2.8h, v3.8h, \sz // out q0
+ srshr \tmp3\sz, \tmp3\sz, #1 // f = (f1 + 1) >> 1
+ bit v23\sz, v0\sz, v4\sz // if (fm && !flat8in)
+ bit v24\sz, v1\sz, v4\sz
+
+ uxtl_sz v0.8h, v1.8h, v22, \sz // p1
+ uxtl_sz v2.8h, v3.8h, v25, \sz // q1
+.if \wd >= 8
+ mov x5, v6.d[0]
+.ifc \sz, .16b
+ mov x6, v6.d[1]
+.endif
+.endif
+ saddw_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3, \sz // p1 + f
+ ssubw_sz v2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q1 - f
+ sqxtun_sz v0, v0.8h, v1.8h, \sz // out p1
+ sqxtun_sz v2, v2.8h, v3.8h, \sz // out q1
+.if \wd >= 8
+.ifc \sz, .16b
+ adds x5, x5, x6
+.endif
+.endif
+ bit v22\sz, v0\sz, v5\sz // if (!hev && fm && !flat8in)
+ bit v25\sz, v2\sz, v5\sz
+
+ // If no pixels need flat8in, jump to flat8out
+ // (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd >= 8
+.ifc \sz, .16b
+ b.eq 6f
+.else
+ cbz x5, 6f
+.endif
+
+ // flat8in
+ uaddl_sz \tmp1\().8h, \tmp2\().8h, v20, v21, \sz
+ uaddl_sz \tmp3\().8h, \tmp4\().8h, v22, v25, \sz
+ uaddl_sz \tmp5\().8h, \tmp6\().8h, v20, v22, \sz
+ uaddl_sz \tmp7\().8h, \tmp8\().8h, v23, v26, \sz
+ add_sz v0.8h, v1.8h, \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
+ uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v23, \sz
+ uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v24, \sz
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp5\().8h, \tmp6\().8h, \sz
+ sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
+ sub_sz \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
+ rshrn_sz v2, v0.8h, v1.8h, #3, \sz // out p2
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz
+ uaddl_sz \tmp1\().8h, \tmp2\().8h, v20, v23, \sz
+ uaddl_sz \tmp3\().8h, \tmp4\().8h, v24, v27, \sz
+ rshrn_sz v3, v0.8h, v1.8h, #3, \sz // out p1
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp7\().8h, \tmp8\().8h, \sz
+ sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
+ uaddl_sz \tmp5\().8h, \tmp6\().8h, v21, v24, \sz
+ uaddl_sz \tmp7\().8h, \tmp8\().8h, v25, v27, \sz
+ rshrn_sz v4, v0.8h, v1.8h, #3, \sz // out p0
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz
+ sub_sz \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
+ uaddl_sz \tmp1\().8h, \tmp2\().8h, v22, v25, \sz
+ uaddl_sz \tmp3\().8h, \tmp4\().8h, v26, v27, \sz
+ rshrn_sz v5, v0.8h, v1.8h, #3, \sz // out q0
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp7\().8h, \tmp8\().8h, \sz
+ sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
+ rshrn_sz \tmp5, v0.8h, v1.8h, #3, \sz // out q1
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz
+ // The output here is written back into the input registers. This doesn't
+ // matter for the flat8out part below, since we only update those pixels
+ // which won't be touched below.
+ bit v21\sz, v2\sz, v6\sz
+ bit v22\sz, v3\sz, v6\sz
+ bit v23\sz, v4\sz, v6\sz
+ rshrn_sz \tmp6, v0.8h, v1.8h, #3, \sz // out q2
+ bit v24\sz, v5\sz, v6\sz
+ bit v25\sz, \tmp5\sz, v6\sz
+ bit v26\sz, \tmp6\sz, v6\sz
+.endif
+.if \wd == 16
+6:
+ orr v2\sz, v6\sz, v7\sz
+ mov x5, v2.d[0]
+.ifc \sz, .16b
+ mov x6, v2.d[1]
+ adds x5, x5, x6
+ b.ne 1f
+.else
+ cbnz x5, 1f
+.endif
+ // If no pixels needed flat8in nor flat8out, jump to a
+ // writeout of the inner 4 pixels
+ br x14
+1:
+
+ mov x5, v7.d[0]
+.ifc \sz, .16b
+ mov x6, v7.d[1]
+ adds x5, x5, x6
+ b.ne 1f
+.else
+ cbnz x5, 1f
+.endif
+ // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ br x15
+
+1:
+ // flat8out
+ // This writes all outputs into v2-v17 (skipping v7 and v16).
+ // If this part is skipped, the output is read from v21-v26 (which is the input
+ // to this section).
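+ // As a rough scalar sketch (not taken from the C reference), the first
+ // output below is
+ //   p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
+ // and each later output slides this window one sample towards q7: the
+ // running sum in v0.8h/v1.8h gets the next sample pair added and the
+ // pair that falls out of the window subtracted before every rshrn #4.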
+ ushll_sz v0.8h, v1.8h, v16, #3, \sz // 8 * v16
+ usubw_sz v0.8h, v1.8h, v0.8h, v1.8h, v16, \sz // 7 * v16
+ uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v17, \sz
+ uaddl_sz v8.8h, v9.8h, v17, v18, \sz
+ uaddl_sz v10.8h, v11.8h, v19, v20, \sz
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v8.8h, v9.8h, \sz
+ uaddl_sz v8.8h, v9.8h, v16, v17, \sz
+ uaddl_sz v12.8h, v13.8h, v21, v22, \sz
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
+ uaddl_sz v10.8h, v11.8h, v18, v25, \sz
+ uaddl_sz v14.8h, v15.8h, v23, v24, \sz
+ sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v12.8h, v13.8h, \sz
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v16, v18, \sz
+ uaddl_sz v14.8h, v15.8h, v19, v26, \sz
+ rshrn_sz v2, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
+ uaddl_sz v8.8h, v9.8h, v16, v19, \sz
+ uaddl_sz v10.8h, v11.8h, v20, v27, \sz
+ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
+ bif v2\sz, v17\sz, v7\sz
+ rshrn_sz v3, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v16, v20, \sz
+ uaddl_sz v14.8h, v15.8h, v21, v28, \sz
+ sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz
+ bif v3\sz, v18\sz, v7\sz
+ rshrn_sz v4, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
+ uaddl_sz v8.8h, v9.8h, v16, v21, \sz
+ uaddl_sz v10.8h, v11.8h, v22, v29, \sz
+ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
+ bif v4\sz, v19\sz, v7\sz
+ rshrn_sz v5, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v16, v22, \sz
+ uaddl_sz v14.8h, v15.8h, v23, v30, \sz
+ sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz
+ bif v5\sz, v20\sz, v7\sz
+ rshrn_sz v6, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
+ uaddl_sz v10.8h, v11.8h, v16, v23, \sz
+ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v24, v31, \sz
+ bif v6\sz, v21\sz, v7\sz
+ rshrn_sz v8, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
+ sub_sz v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v17, v24, \sz
+ uaddl_sz v14.8h, v15.8h, v25, v31, \sz
+ bif v8\sz, v22\sz, v7\sz
+ rshrn_sz v9, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz
+ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v26, v31, \sz
+ bif v9\sz, v23\sz, v7\sz
+ rshrn_sz v10, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v14.8h, v15.8h, v18, v25, \sz
+ uaddl_sz v18.8h, v19.8h, v19, v26, \sz
+ sub_sz v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v14.8h, v15.8h, v27, v31, \sz
+ bif v10\sz, v24\sz, v7\sz
+ rshrn_sz v11, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v12.8h, v13.8h, \sz
+ uaddl_sz v12.8h, v13.8h, v20, v27, \sz
+ sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
+ uaddl_sz v18.8h, v19.8h, v28, v31, \sz
+ bif v11\sz, v25\sz, v7\sz
+ sub_sz v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
+ rshrn_sz v12, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v14.8h, v15.8h, v21, v28, \sz
+ uaddl_sz v20.8h, v21.8h, v29, v31, \sz
+ bif v12\sz, v26\sz, v7\sz
+ rshrn_sz v13, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v18.8h, v19.8h, \sz
+ sub_sz v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
+ uaddl_sz v18.8h, v19.8h, v22, v29, \sz
+ uaddl_sz v22.8h, v23.8h, v30, v31, \sz
+ bif v13\sz, v27\sz, v7\sz
+ rshrn_sz v14, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v20.8h, v21.8h, \sz
+ sub_sz v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
+ bif v14\sz, v28\sz, v7\sz
+ rshrn_sz v15, v0.8h, v1.8h, #4, \sz
+
+ add_sz v0.8h, v1.8h, v0.8h, v1.8h, v22.8h, v23.8h, \sz
+ bif v15\sz, v29\sz, v7\sz
+ rshrn_sz v17, v0.8h, v1.8h, #4, \sz
+ bif v17\sz, v30\sz, v7\sz
+.endif
+.endm
+
+// For wd <= 8 we use v16-v19 and v28-v31 as temp registers; for wd=16
+// those registers are needed for inputs/outputs, so v8-v15 are used as
+// temp registers there instead.
+function vp9_loop_filter_4
+ loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+9:
+ br x10
+endfunc
+
+function vp9_loop_filter_4_16b_mix_44
+ loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+9:
+ br x10
+endfunc
+
+function vp9_loop_filter_8
+ loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+6:
+ br x13
+9:
+ br x10
+endfunc
+
+function vp9_loop_filter_8_16b_mix
+ loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+6:
+ br x13
+9:
+ br x10
+endfunc
+
+function vp9_loop_filter_16
+ loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15
+ ret
+9:
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x10
+endfunc
+
+function vp9_loop_filter_16_16b
+ loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15
+ ret
+9:
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x10
+endfunc
+
+.macro loop_filter_4
+ bl vp9_loop_filter_4
+.endm
+
+.macro loop_filter_4_16b_mix mix
+ bl vp9_loop_filter_4_16b_mix_\mix
+.endm
+
+.macro loop_filter_8
+ // calculate alternative 'return' targets
+ adr x13, 6f
+ bl vp9_loop_filter_8
+.endm
+
+.macro loop_filter_8_16b_mix mix
+ // calculate alternative 'return' targets
+ adr x13, 6f
+.if \mix == 48
+ mov x11, #0xffffffff00000000
+.elseif \mix == 84
+ mov x11, #0x00000000ffffffff
+.else
+ mov x11, #0xffffffffffffffff
+.endif
+ bl vp9_loop_filter_8_16b_mix
+.endm
+
+.macro loop_filter_16
+ // calculate alternative 'return' targets
+ adr x14, 7f
+ adr x15, 8f
+ bl vp9_loop_filter_16
+.endm
+
+.macro loop_filter_16_16b
+ // calculate alternative 'return' targets
+ adr x14, 7f
+ adr x15, 8f
+ bl vp9_loop_filter_16_16b
+.endm
+
+
+// The public functions in this file have the following signature:
+// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
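+//
+// For reference, a rough scalar sketch (assumed libvpx-style logic, not
+// part of this file) of the per-edge masks that the loop_filter macro above
+// derives from these parameters, with E = mb_lim, I = lim, H = hev_thr:
+//
+//   #include <stdint.h>
+//   #include <stdlib.h>
+//   static int lpf_masks(const uint8_t *p, const uint8_t *q, // p[0..3], q[0..3]
+//                        int E, int I, int H, int *hev, int *flat8in)
+//   {
+//       int fm = abs(p[3] - p[2]) <= I && abs(p[2] - p[1]) <= I &&
+//                abs(p[1] - p[0]) <= I && abs(q[1] - q[0]) <= I &&
+//                abs(q[2] - q[1]) <= I && abs(q[3] - q[2]) <= I &&
+//                abs(p[0] - q[0]) * 2 + abs(p[1] - q[1]) / 2 <= E;
+//       *hev     = abs(p[1] - p[0]) > H || abs(q[1] - q[0]) > H;
+//       *flat8in = abs(p[1] - p[0]) <= 1 && abs(q[1] - q[0]) <= 1 &&
+//                  abs(p[2] - p[0]) <= 1 && abs(q[2] - q[0]) <= 1 &&
+//                  abs(p[3] - p[0]) <= 1 && abs(q[3] - q[0]) <= 1;
+//       return fm;                     // "fm" in the comments above
+//   }
+//
+// The normal (non-flat) filter then adjusts p1, p0, q0, q1 as described by
+// the FFMIN(f + 4, 127)/FFMIN(f + 3, 127) comments in the macro above.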
+
+function ff_vp9_loop_filter_v_4_8_neon, export=1
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.8b}, [x9], x1 // p3
+ ld1 {v24.8b}, [x0], x1 // q0
+ ld1 {v21.8b}, [x9], x1 // p2
+ ld1 {v25.8b}, [x0], x1 // q1
+ ld1 {v22.8b}, [x9], x1 // p1
+ ld1 {v26.8b}, [x0], x1 // q2
+ ld1 {v23.8b}, [x9], x1 // p0
+ ld1 {v27.8b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+ sub x9, x9, x1, lsl #1
+
+ loop_filter_4
+
+ st1 {v22.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+
+ br x10
+endfunc
+
+function ff_vp9_loop_filter_v_44_16_neon, export=1
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.16b}, [x9], x1 // p3
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v21.16b}, [x9], x1 // p2
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v22.16b}, [x9], x1 // p1
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v23.16b}, [x9], x1 // p0
+ ld1 {v27.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+ sub x9, x9, x1, lsl #1
+
+ loop_filter_4_16b_mix 44
+
+ st1 {v22.16b}, [x9], x1
+ st1 {v24.16b}, [x0], x1
+ st1 {v23.16b}, [x9], x1
+ st1 {v25.16b}, [x0], x1
+
+ br x10
+endfunc
+
+function ff_vp9_loop_filter_h_4_8_neon, export=1
+ mov x10, x30
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #2
+ ld1 {v20.8b}, [x9], x1
+ ld1 {v24.8b}, [x0], x1
+ ld1 {v21.8b}, [x9], x1
+ ld1 {v25.8b}, [x0], x1
+ ld1 {v22.8b}, [x9], x1
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v23.8b}, [x9], x1
+ ld1 {v27.8b}, [x0], x1
+
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #2
+ // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
+ // outermost 2 pixels since they aren't changed.
+ add x9, x9, #2
+ add x0, x0, #2
+
+ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_4
+
+ // We will only write the middle 4 pixels back; after the loop filter,
+ // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
+ // We need to transpose them back to columns, done with a 4x8 transpose
+ // (in practice two 4x4 transposes of the two 4x4 halves of the
+ // 8x4 block, giving 4x8 pixels).
+ transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.s}[0], [x9], x1
+ st1 {v22.s}[1], [x0], x1
+ st1 {v23.s}[0], [x9], x1
+ st1 {v23.s}[1], [x0], x1
+ st1 {v24.s}[0], [x9], x1
+ st1 {v24.s}[1], [x0], x1
+ st1 {v25.s}[0], [x9], x1
+ st1 {v25.s}[1], [x0], x1
+
+ br x10
+endfunc
+
+function ff_vp9_loop_filter_h_44_16_neon, export=1
+ mov x10, x30
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #3
+ ld1 {v20.8b}, [x9], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.8b}, [x9], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.8b}, [x9], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.8b}, [x9], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.8b}, [x9], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.8b}, [x9], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.8b}, [x9], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.8b}, [x9], x1
+ ld1 {v27.d}[1], [x0], x1
+
+ sub x9, x9, x1, lsl #3
+ sub x0, x0, x1, lsl #3
+ add x9, x9, #2
+ add x0, x0, #2
+
+ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_4_16b_mix 44
+
+ transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v22.s}[0], [x9], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x9], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x9], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x9], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x9], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x9], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x9], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x9], x1
+ st1 {v25.s}[3], [x0], x1
+
+ br x10
+endfunc
+
+function ff_vp9_loop_filter_v_8_8_neon, export=1
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.8b}, [x9], x1 // p3
+ ld1 {v24.8b}, [x0], x1 // q0
+ ld1 {v21.8b}, [x9], x1 // p2
+ ld1 {v25.8b}, [x0], x1 // q1
+ ld1 {v22.8b}, [x9], x1 // p1
+ ld1 {v26.8b}, [x0], x1 // q2
+ ld1 {v23.8b}, [x9], x1 // p0
+ ld1 {v27.8b}, [x0], x1 // q3
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #2
+ add x9, x9, x1
+
+ loop_filter_8
+
+ st1 {v21.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v22.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v26.8b}, [x0], x1
+
+ br x10
+6:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+ br x10
+endfunc
+
+.macro mix_v_16 mix
+function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.16b}, [x9], x1 // p3
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v21.16b}, [x9], x1 // p2
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v22.16b}, [x9], x1 // p1
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v23.16b}, [x9], x1 // p0
+ ld1 {v27.16b}, [x0], x1 // q3
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #2
+ add x9, x9, x1
+
+ loop_filter_8_16b_mix \mix
+
+ st1 {v21.16b}, [x9], x1
+ st1 {v24.16b}, [x0], x1
+ st1 {v22.16b}, [x9], x1
+ st1 {v25.16b}, [x0], x1
+ st1 {v23.16b}, [x9], x1
+ st1 {v26.16b}, [x0], x1
+
+ br x10
+6:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.16b}, [x9], x1
+ st1 {v24.16b}, [x0], x1
+ st1 {v23.16b}, [x9], x1
+ st1 {v25.16b}, [x0], x1
+ br x10
+endfunc
+.endm
+
+mix_v_16 48
+mix_v_16 84
+mix_v_16 88
+
+function ff_vp9_loop_filter_h_8_8_neon, export=1
+ mov x10, x30
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #2
+ ld1 {v20.8b}, [x9], x1
+ ld1 {v24.8b}, [x0], x1
+ ld1 {v21.8b}, [x9], x1
+ ld1 {v25.8b}, [x0], x1
+ ld1 {v22.8b}, [x9], x1
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v23.8b}, [x9], x1
+ ld1 {v27.8b}, [x0], x1
+
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #2
+
+ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_8
+
+ // Even though only 6 pixels per row have been changed, we write the
+ // full 8 pixel registers.
+ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v21.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+ st1 {v22.8b}, [x9], x1
+ st1 {v26.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v27.8b}, [x0], x1
+
+ br x10
+6:
+ // If we didn't need to do the flat8in part, we use the same writeback
+ // as in loop_filter_h_4_8.
+ add x9, x9, #2
+ add x0, x0, #2
+ transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.s}[0], [x9], x1
+ st1 {v22.s}[1], [x0], x1
+ st1 {v23.s}[0], [x9], x1
+ st1 {v23.s}[1], [x0], x1
+ st1 {v24.s}[0], [x9], x1
+ st1 {v24.s}[1], [x0], x1
+ st1 {v25.s}[0], [x9], x1
+ st1 {v25.s}[1], [x0], x1
+ br x10
+endfunc
+
+.macro mix_h_16 mix
+function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
+ mov x10, x30
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #3
+ ld1 {v20.8b}, [x9], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.8b}, [x9], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.8b}, [x9], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.8b}, [x9], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.8b}, [x9], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.8b}, [x9], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.8b}, [x9], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.8b}, [x9], x1
+ ld1 {v27.d}[1], [x0], x1
+
+ sub x9, x9, x1, lsl #3
+ sub x0, x0, x1, lsl #3
+
+ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_8_16b_mix \mix
+
+ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8b}, [x9], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.8b}, [x9], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.8b}, [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.8b}, [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.8b}, [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.8b}, [x9], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.8b}, [x9], x1
+ st1 {v27.d}[1], [x0], x1
+
+ br x10
+6:
+ add x9, x9, #2
+ add x0, x0, #2
+ transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.s}[0], [x9], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x9], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x9], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x9], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x9], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x9], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x9], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x9], x1
+ st1 {v25.s}[3], [x0], x1
+ br x10
+endfunc
+.endm
+
+mix_h_16 48
+mix_h_16 84
+mix_h_16 88
+
+function ff_vp9_loop_filter_v_16_8_neon, export=1
+ mov x10, x30
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ sub x9, x0, x1, lsl #3
+ ld1 {v16.8b}, [x9], x1 // p7
+ ld1 {v24.8b}, [x0], x1 // q0
+ ld1 {v17.8b}, [x9], x1 // p6
+ ld1 {v25.8b}, [x0], x1 // q1
+ ld1 {v18.8b}, [x9], x1 // p5
+ ld1 {v26.8b}, [x0], x1 // q2
+ ld1 {v19.8b}, [x9], x1 // p4
+ ld1 {v27.8b}, [x0], x1 // q3
+ ld1 {v20.8b}, [x9], x1 // p3
+ ld1 {v28.8b}, [x0], x1 // q4
+ ld1 {v21.8b}, [x9], x1 // p2
+ ld1 {v29.8b}, [x0], x1 // q5
+ ld1 {v22.8b}, [x9], x1 // p1
+ ld1 {v30.8b}, [x0], x1 // q6
+ ld1 {v23.8b}, [x9], x1 // p0
+ ld1 {v31.8b}, [x0], x1 // q7
+ sub x9, x9, x1, lsl #3
+ sub x0, x0, x1, lsl #3
+ add x9, x9, x1
+
+ loop_filter_16
+
+ // If we did the flat8out part, we get the output in
+ // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
+ // store v2-v9 there, and v10-v17 into x0.
+ st1 {v2.8b}, [x9], x1
+ st1 {v10.8b}, [x0], x1
+ st1 {v3.8b}, [x9], x1
+ st1 {v11.8b}, [x0], x1
+ st1 {v4.8b}, [x9], x1
+ st1 {v12.8b}, [x0], x1
+ st1 {v5.8b}, [x9], x1
+ st1 {v13.8b}, [x0], x1
+ st1 {v6.8b}, [x9], x1
+ st1 {v14.8b}, [x0], x1
+ st1 {v8.8b}, [x9], x1
+ st1 {v15.8b}, [x0], x1
+ st1 {v9.8b}, [x9], x1
+ st1 {v17.8b}, [x0], x1
+9:
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x10
+8:
+ add x9, x9, x1, lsl #2
+ // If we didn't do the flat8out part, the output is left in the
+ // input registers.
+ st1 {v21.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v22.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v26.8b}, [x0], x1
+ b 9b
+7:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+ b 9b
+endfunc
+
+function ff_vp9_loop_filter_v_16_16_neon, export=1
+ mov x10, x30
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ sub x9, x0, x1, lsl #3
+ ld1 {v16.16b}, [x9], x1 // p7
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v17.16b}, [x9], x1 // p6
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v18.16b}, [x9], x1 // p5
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v19.16b}, [x9], x1 // p4
+ ld1 {v27.16b}, [x0], x1 // q3
+ ld1 {v20.16b}, [x9], x1 // p3
+ ld1 {v28.16b}, [x0], x1 // q4
+ ld1 {v21.16b}, [x9], x1 // p2
+ ld1 {v29.16b}, [x0], x1 // q5
+ ld1 {v22.16b}, [x9], x1 // p1
+ ld1 {v30.16b}, [x0], x1 // q6
+ ld1 {v23.16b}, [x9], x1 // p0
+ ld1 {v31.16b}, [x0], x1 // q7
+ sub x9, x9, x1, lsl #3
+ sub x0, x0, x1, lsl #3
+ add x9, x9, x1
+
+ loop_filter_16_16b
+
+ st1 {v2.16b}, [x9], x1
+ st1 {v10.16b}, [x0], x1
+ st1 {v3.16b}, [x9], x1
+ st1 {v11.16b}, [x0], x1
+ st1 {v4.16b}, [x9], x1
+ st1 {v12.16b}, [x0], x1
+ st1 {v5.16b}, [x9], x1
+ st1 {v13.16b}, [x0], x1
+ st1 {v6.16b}, [x9], x1
+ st1 {v14.16b}, [x0], x1
+ st1 {v8.16b}, [x9], x1
+ st1 {v15.16b}, [x0], x1
+ st1 {v9.16b}, [x9], x1
+ st1 {v17.16b}, [x0], x1
+9:
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x10
+8:
+ add x9, x9, x1, lsl #2
+ st1 {v21.16b}, [x9], x1
+ st1 {v24.16b}, [x0], x1
+ st1 {v22.16b}, [x9], x1
+ st1 {v25.16b}, [x0], x1
+ st1 {v23.16b}, [x9], x1
+ st1 {v26.16b}, [x0], x1
+ b 9b
+7:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.16b}, [x9], x1
+ st1 {v24.16b}, [x0], x1
+ st1 {v23.16b}, [x9], x1
+ st1 {v25.16b}, [x0], x1
+ b 9b
+endfunc
+
+function ff_vp9_loop_filter_h_16_8_neon, export=1
+ mov x10, x30
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ sub x9, x0, #8
+ ld1 {v16.8b}, [x9], x1
+ ld1 {v24.8b}, [x0], x1
+ ld1 {v17.8b}, [x9], x1
+ ld1 {v25.8b}, [x0], x1
+ ld1 {v18.8b}, [x9], x1
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v19.8b}, [x9], x1
+ ld1 {v27.8b}, [x0], x1
+ ld1 {v20.8b}, [x9], x1
+ ld1 {v28.8b}, [x0], x1
+ ld1 {v21.8b}, [x9], x1
+ ld1 {v29.8b}, [x0], x1
+ ld1 {v22.8b}, [x9], x1
+ ld1 {v30.8b}, [x0], x1
+ ld1 {v23.8b}, [x9], x1
+ ld1 {v31.8b}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ sub x9, x9, x1, lsl #3
+
+ // The 16x8 block of pixels read above is split into two 8x8 blocks: the
+ // left half in v16-v23 and the right half in v24-v31. Do two 8x8
+ // transposes of these to get one column per register.
+ transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ loop_filter_16
+
+ transpose_8x8B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
+ transpose_8x8B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
+
+ st1 {v16.8b}, [x9], x1
+ st1 {v10.8b}, [x0], x1
+ st1 {v2.8b}, [x9], x1
+ st1 {v11.8b}, [x0], x1
+ st1 {v3.8b}, [x9], x1
+ st1 {v12.8b}, [x0], x1
+ st1 {v4.8b}, [x9], x1
+ st1 {v13.8b}, [x0], x1
+ st1 {v5.8b}, [x9], x1
+ st1 {v14.8b}, [x0], x1
+ st1 {v6.8b}, [x9], x1
+ st1 {v15.8b}, [x0], x1
+ st1 {v8.8b}, [x9], x1
+ st1 {v17.8b}, [x0], x1
+ st1 {v9.8b}, [x9], x1
+ st1 {v31.8b}, [x0], x1
+9:
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x10
+8:
+ // The same writeback as in loop_filter_h_8_8
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #2
+ transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8b}, [x9], x1
+ st1 {v24.8b}, [x0], x1
+ st1 {v21.8b}, [x9], x1
+ st1 {v25.8b}, [x0], x1
+ st1 {v22.8b}, [x9], x1
+ st1 {v26.8b}, [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v27.8b}, [x0], x1
+ b 9b
+7:
+ // The same writeback as in loop_filter_h_4_8
+ sub x9, x0, #2
+ add x0, x9, x1, lsl #2
+ transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.s}[0], [x9], x1
+ st1 {v22.s}[1], [x0], x1
+ st1 {v23.s}[0], [x9], x1
+ st1 {v23.s}[1], [x0], x1
+ st1 {v24.s}[0], [x9], x1
+ st1 {v24.s}[1], [x0], x1
+ st1 {v25.s}[0], [x9], x1
+ st1 {v25.s}[1], [x0], x1
+ b 9b
+endfunc
+
+function ff_vp9_loop_filter_h_16_16_neon, export=1
+ mov x10, x30
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ sub x9, x0, #8
+ ld1 {v16.8b}, [x9], x1
+ ld1 {v24.8b}, [x0], x1
+ ld1 {v17.8b}, [x9], x1
+ ld1 {v25.8b}, [x0], x1
+ ld1 {v18.8b}, [x9], x1
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v19.8b}, [x9], x1
+ ld1 {v27.8b}, [x0], x1
+ ld1 {v20.8b}, [x9], x1
+ ld1 {v28.8b}, [x0], x1
+ ld1 {v21.8b}, [x9], x1
+ ld1 {v29.8b}, [x0], x1
+ ld1 {v22.8b}, [x9], x1
+ ld1 {v30.8b}, [x0], x1
+ ld1 {v23.8b}, [x9], x1
+ ld1 {v31.8b}, [x0], x1
+ ld1 {v16.d}[1], [x9], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v17.d}[1], [x9], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v18.d}[1], [x9], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v19.d}[1], [x9], x1
+ ld1 {v27.d}[1], [x0], x1
+ ld1 {v20.d}[1], [x9], x1
+ ld1 {v28.d}[1], [x0], x1
+ ld1 {v21.d}[1], [x9], x1
+ ld1 {v29.d}[1], [x0], x1
+ ld1 {v22.d}[1], [x9], x1
+ ld1 {v30.d}[1], [x0], x1
+ ld1 {v23.d}[1], [x9], x1
+ ld1 {v31.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #4
+ sub x9, x9, x1, lsl #4
+
+ transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ loop_filter_16_16b
+
+ transpose_8x16B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
+ transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
+
+ st1 {v16.8b}, [x9], x1
+ st1 {v10.8b}, [x0], x1
+ st1 {v2.8b}, [x9], x1
+ st1 {v11.8b}, [x0], x1
+ st1 {v3.8b}, [x9], x1
+ st1 {v12.8b}, [x0], x1
+ st1 {v4.8b}, [x9], x1
+ st1 {v13.8b}, [x0], x1
+ st1 {v5.8b}, [x9], x1
+ st1 {v14.8b}, [x0], x1
+ st1 {v6.8b}, [x9], x1
+ st1 {v15.8b}, [x0], x1
+ st1 {v8.8b}, [x9], x1
+ st1 {v17.8b}, [x0], x1
+ st1 {v9.8b}, [x9], x1
+ st1 {v31.8b}, [x0], x1
+ st1 {v16.d}[1], [x9], x1
+ st1 {v10.d}[1], [x0], x1
+ st1 {v2.d}[1], [x9], x1
+ st1 {v11.d}[1], [x0], x1
+ st1 {v3.d}[1], [x9], x1
+ st1 {v12.d}[1], [x0], x1
+ st1 {v4.d}[1], [x9], x1
+ st1 {v13.d}[1], [x0], x1
+ st1 {v5.d}[1], [x9], x1
+ st1 {v14.d}[1], [x0], x1
+ st1 {v6.d}[1], [x9], x1
+ st1 {v15.d}[1], [x0], x1
+ st1 {v8.d}[1], [x9], x1
+ st1 {v17.d}[1], [x0], x1
+ st1 {v9.d}[1], [x9], x1
+ st1 {v31.d}[1], [x0], x1
+9:
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x10
+8:
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #3
+ transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8b}, [x9], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.8b}, [x9], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.8b}, [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.8b}, [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.8b}, [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.8b}, [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.8b}, [x9], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.8b}, [x9], x1
+ st1 {v27.d}[1], [x0], x1
+ b 9b
+7:
+ sub x9, x0, #2
+ add x0, x9, x1, lsl #3
+ transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.s}[0], [x9], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x9], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x9], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x9], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x9], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x9], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x9], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x9], x1
+ st1 {v25.s}[3], [x0], x1
+ b 9b
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000000..cac6428709
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// All public functions in this file have the following signature:
+// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+// const uint8_t *ref, ptrdiff_t ref_stride,
+// int h, int mx, int my);
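+//
+// As an illustrative scalar sketch (assuming FILTER_BITS = 7 and the 8-tap
+// coefficients from ff_vp9_subpel_filters; this is not the actual C
+// reference code), a horizontal "put" at bpp bits per sample computes:
+//
+//   #include <stdint.h>
+//   static uint16_t filter_px(const uint16_t *src, const int16_t *coef, int bpp)
+//   {
+//       int k, sum = 0;
+//       for (k = 0; k < 8; k++)
+//           sum += coef[k] * src[k - 3];     // taps from src[-3] to src[4]
+//       sum = (sum + 64) >> 7;               // rounding shift (sqrshrun #7)
+//       if (sum < 0)
+//           sum = 0;                         // sqrshrun clamps negatives to 0
+//       if (sum > (1 << bpp) - 1)
+//           sum = (1 << bpp) - 1;            // umin against v31 below
+//       return sum;
+//   }
+//
+// The "avg" variants then average that result with the existing destination
+// sample: dst[x] = (dst[x] + filter_px(...) + 1) >> 1, which is what urhadd
+// implements.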
+
+function ff_vp9_copy128_aarch64, export=1
+1:
+ ldp x5, x6, [x2]
+ ldp x7, x8, [x2, #16]
+ stp x5, x6, [x0]
+ ldp x9, x10, [x2, #32]
+ stp x7, x8, [x0, #16]
+ subs w4, w4, #1
+ ldp x11, x12, [x2, #48]
+ stp x9, x10, [x0, #32]
+ stp x11, x12, [x0, #48]
+ ldp x5, x6, [x2, #64]
+ ldp x7, x8, [x2, #80]
+ stp x5, x6, [x0, #64]
+ ldp x9, x10, [x2, #96]
+ stp x7, x8, [x0, #80]
+ ldp x11, x12, [x2, #112]
+ stp x9, x10, [x0, #96]
+ stp x11, x12, [x0, #112]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1
+ mov x5, x0
+ sub x1, x1, #64
+ sub x3, x3, #64
+1:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+ urhadd v0.8h, v0.8h, v4.8h
+ urhadd v1.8h, v1.8h, v5.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+ urhadd v2.8h, v2.8h, v6.8h
+ urhadd v3.8h, v3.8h, v7.8h
+ subs w4, w4, #1
+ urhadd v16.8h, v16.8h, v20.8h
+ urhadd v17.8h, v17.8h, v21.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
+ urhadd v18.8h, v18.8h, v22.8h
+ urhadd v19.8h, v19.8h, v23.8h
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+ urhadd v0.8h, v0.8h, v4.8h
+ urhadd v1.8h, v1.8h, v5.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+ urhadd v2.8h, v2.8h, v6.8h
+ urhadd v3.8h, v3.8h, v7.8h
+ subs w4, w4, #2
+ urhadd v16.8h, v16.8h, v20.8h
+ urhadd v17.8h, v17.8h, v21.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
+ urhadd v18.8h, v18.8h, v22.8h
+ urhadd v19.8h, v19.8h, v23.8h
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+1:
+ ld1 {v2.8h, v3.8h}, [x2], x3
+ ld1 {v0.8h, v1.8h}, [x0]
+ urhadd v0.8h, v0.8h, v2.8h
+ urhadd v1.8h, v1.8h, v3.8h
+ subs w4, w4, #1
+ st1 {v0.8h, v1.8h}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.8h}, [x2], x3
+ ld1 {v0.8h}, [x0], x1
+ ld1 {v3.8h}, [x2], x3
+ urhadd v0.8h, v0.8h, v2.8h
+ ld1 {v1.8h}, [x0], x1
+ urhadd v1.8h, v1.8h, v3.8h
+ subs w4, w4, #2
+ st1 {v0.8h}, [x5], x1
+ st1 {v1.8h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.4h}, [x2], x3
+ ld1 {v0.4h}, [x0], x1
+ ld1 {v3.4h}, [x2], x3
+ urhadd v0.4h, v0.4h, v2.4h
+ ld1 {v1.4h}, [x0], x1
+ urhadd v1.4h, v1.4h, v3.4h
+ subs w4, w4, #2
+ st1 {v0.4h}, [x5], x1
+ st1 {v1.4h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+
+// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
+// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
+// for size >= 16)
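+//
+// Roughly (a sketch, not upstream documentation): on 16-bit samples the
+// ext by #(2*\offset) bytes produces the lane-shifted vector starting at
+// element \offset, so each extmlal adds one filter tap to every lane:
+//
+//   for (i = 0; i < n; i++)
+//       dst[i] += (int32_t)coef[offset] * src[i + offset];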
+.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
+ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
+ smlal \dst1\().4s, v20.4h, v0.h[\offset]
+ smlal \dst5\().4s, v22.4h, v0.h[\offset]
+.if \size >= 16
+ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+.endif
+.if \size >= 8
+ smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
+ smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
+.endif
+.if \size >= 16
+ smlal \dst3\().4s, v21.4h, v0.h[\offset]
+ smlal \dst7\().4s, v23.4h, v0.h[\offset]
+ smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
+ smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
+.endif
+.endm
+
+
+// Instantiate a horizontal filter function for the given size.
+// This can work on 4, 8 or 16 pixels in parallel; for larger
+// widths it will do 16 pixels at a time and loop horizontally.
+// The actual width (in bytes) is passed in x5, the height in w4 and
+// the filter coefficients in x9.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+ sub x2, x2, #6
+ add x6, x0, x1
+ add x7, x2, x3
+ add x1, x1, x1
+ add x3, x3, x3
+ // Only size >= 16 loops horizontally and needs
+ // reduced dst stride
+.if \size >= 16
+ sub x1, x1, x5
+.endif
+ // size >= 16 loads three qwords and increments x2,
+ // for size 4/8 two qwords are enough and no
+ // postincrement is needed
+.if \size >= 16
+ sub x3, x3, x5
+ sub x3, x3, #16
+.endif
+ // Load the filter vector
+ ld1 {v0.8h}, [x9]
+1:
+.if \size >= 16
+ mov x9, x5
+.endif
+ // Load src
+.if \size >= 16
+ ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
+ ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
+.else
+ ld1 {v5.8h, v6.8h}, [x2]
+ ld1 {v16.8h, v17.8h}, [x7]
+.endif
+2:
+
+ smull v1.4s, v5.4h, v0.h[0]
+ smull v24.4s, v16.4h, v0.h[0]
+.if \size >= 8
+ smull2 v2.4s, v5.8h, v0.h[0]
+ smull2 v25.4s, v16.8h, v0.h[0]
+.endif
+.if \size >= 16
+ smull v3.4s, v6.4h, v0.h[0]
+ smull v26.4s, v17.4h, v0.h[0]
+ smull2 v4.4s, v6.8h, v0.h[0]
+ smull2 v27.4s, v17.8h, v0.h[0]
+.endif
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size
+
+ // Round, shift and saturate
+ // The sqrshrun takes care of clamping negative values to zero, but
+ // we manually need to do umin with the max pixel value.
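+ // Scalar sketch of this block: with acc the 32-bit accumulator per pixel,
+ //   out = FFMIN(av_clip_uint16((acc + 64) >> 7), (1 << bpp) - 1)
+ // where v31 holds (1 << bpp) - 1, set up by the dispatch function below.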
+ sqrshrun v1.4h, v1.4s, #7
+ sqrshrun v24.4h, v24.4s, #7
+.if \size >= 8
+ sqrshrun2 v1.8h, v2.4s, #7
+ sqrshrun2 v24.8h, v25.4s, #7
+ umin v1.8h, v1.8h, v31.8h
+ umin v24.8h, v24.8h, v31.8h
+.if \size >= 16
+ sqrshrun v2.4h, v3.4s, #7
+ sqrshrun v25.4h, v26.4s, #7
+ sqrshrun2 v2.8h, v4.4s, #7
+ sqrshrun2 v25.8h, v27.4s, #7
+ umin v2.8h, v2.8h, v31.8h
+ umin v25.8h, v25.8h, v31.8h
+.endif
+.else
+ umin v1.4h, v1.4h, v31.4h
+ umin v24.4h, v24.4h, v31.4h
+.endif
+ // Average
+.ifc \type,avg
+.if \size >= 16
+ ld1 {v3.8h, v4.8h}, [x0]
+ ld1 {v29.8h, v30.8h}, [x6]
+ urhadd v1.8h, v1.8h, v3.8h
+ urhadd v2.8h, v2.8h, v4.8h
+ urhadd v24.8h, v24.8h, v29.8h
+ urhadd v25.8h, v25.8h, v30.8h
+.elseif \size >= 8
+ ld1 {v3.8h}, [x0]
+ ld1 {v4.8h}, [x6]
+ urhadd v1.8h, v1.8h, v3.8h
+ urhadd v24.8h, v24.8h, v4.8h
+.else
+ ld1 {v3.4h}, [x0]
+ ld1 {v4.4h}, [x6]
+ urhadd v1.4h, v1.4h, v3.4h
+ urhadd v24.4h, v24.4h, v4.4h
+.endif
+.endif
+ // Store and loop horizontally (for size >= 16)
+.if \size >= 16
+ subs x9, x9, #32
+ st1 {v1.8h, v2.8h}, [x0], #32
+ st1 {v24.8h, v25.8h}, [x6], #32
+ b.eq 3f
+ mov v5.16b, v7.16b
+ mov v16.16b, v18.16b
+ ld1 {v6.8h, v7.8h}, [x2], #32
+ ld1 {v17.8h, v18.8h}, [x7], #32
+ b 2b
+.elseif \size == 8
+ st1 {v1.8h}, [x0]
+ st1 {v24.8h}, [x6]
+.else // \size == 4
+ st1 {v1.4h}, [x0]
+ st1 {v24.4h}, [x6]
+.endif
+3:
+ // Loop vertically
+ add x0, x0, x1
+ add x6, x6, x1
+ add x2, x2, x3
+ add x7, x7, x3
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+ mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+ movrel x6, X(ff_vp9_subpel_filters), 256*\offset
+ cmp w5, #8
+ add x9, x6, w5, uxtw #4
+ mov x5, #2*\size
+.if \size >= 16
+ b \type\()_8tap_16h
+.else
+ b \type\()_8tap_\size\()h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp, 2, \size, \bpp
+do_8tap_h_func avg, sharp, 2, \size, \bpp
+do_8tap_h_func put, smooth, 0, \size, \bpp
+do_8tap_h_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8, \bpp
+do_8tap_h_filters 4, \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+
+// Vertical filters
+
+// Round, shift and saturate and store reg1-reg4
+.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+ sqrshrun \reg1\().4h, \reg1\().4s, #7
+ sqrshrun \reg2\().4h, \reg2\().4s, #7
+ sqrshrun \reg3\().4h, \reg3\().4s, #7
+ sqrshrun \reg4\().4h, \reg4\().4s, #7
+.ifc \type,avg
+ ld1 {\tmp1\().4h}, [x7], x1
+ ld1 {\tmp2\().4h}, [x7], x1
+ ld1 {\tmp3\().4h}, [x7], x1
+ ld1 {\tmp4\().4h}, [x7], x1
+.endif
+ umin \reg1\().4h, \reg1\().4h, \minreg\().4h
+ umin \reg2\().4h, \reg2\().4h, \minreg\().4h
+ umin \reg3\().4h, \reg3\().4h, \minreg\().4h
+ umin \reg4\().4h, \reg4\().4h, \minreg\().4h
+.ifc \type,avg
+ urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
+ urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
+ urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
+ urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
+.endif
+ st1 {\reg1\().4h}, [x0], x1
+ st1 {\reg2\().4h}, [x0], x1
+ st1 {\reg3\().4h}, [x0], x1
+ st1 {\reg4\().4h}, [x0], x1
+.endm
+
+// Round, shift and saturate and store reg1-8, where
+// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
+.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
+ sqrshrun \reg1\().4h, \reg1\().4s, #7
+ sqrshrun2 \reg1\().8h, \reg2\().4s, #7
+ sqrshrun \reg2\().4h, \reg3\().4s, #7
+ sqrshrun2 \reg2\().8h, \reg4\().4s, #7
+ sqrshrun \reg3\().4h, \reg5\().4s, #7
+ sqrshrun2 \reg3\().8h, \reg6\().4s, #7
+ sqrshrun \reg4\().4h, \reg7\().4s, #7
+ sqrshrun2 \reg4\().8h, \reg8\().4s, #7
+.ifc \type,avg
+ ld1 {\reg5\().8h}, [x7], x1
+ ld1 {\reg6\().8h}, [x7], x1
+ ld1 {\reg7\().8h}, [x7], x1
+ ld1 {\reg8\().8h}, [x7], x1
+.endif
+ umin \reg1\().8h, \reg1\().8h, \minreg\().8h
+ umin \reg2\().8h, \reg2\().8h, \minreg\().8h
+ umin \reg3\().8h, \reg3\().8h, \minreg\().8h
+ umin \reg4\().8h, \reg4\().8h, \minreg\().8h
+.ifc \type,avg
+ urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
+ urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
+ urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
+ urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
+.endif
+ st1 {\reg1\().8h}, [x0], x1
+ st1 {\reg2\().8h}, [x0], x1
+ st1 {\reg3\().8h}, [x0], x1
+ st1 {\reg4\().8h}, [x0], x1
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+// (src1-src8 into dst1, src2-src9 into dst2).
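+//
+// In other words (an explanatory sketch, not upstream documentation), with
+// rows r0..r8 and coefficients c0..c7 the two accumulators per output are
+//
+//   dst[i] = c0*r0[i] + c2*r2[i] + c4*r4[i] + c6*r6[i];
+//   tmp[i] = c1*r1[i] + c3*r3[i] + c5*r5[i] + c7*r7[i];
+//   dst[i] += tmp[i];   // 32-bit accumulators, so a plain add is enough
+//
+// splitting the even and odd taps presumably just shortens the dependency
+// chain; the second output uses rows r1..r8 in the same way.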
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+ smull \dst1\().4s, \src1\().4h, v0.h[0]
+ smull \dst2\().4s, \src2\().4h, v0.h[0]
+ smull \tmp1\().4s, \src2\().4h, v0.h[1]
+ smull \tmp2\().4s, \src3\().4h, v0.h[1]
+ smlal \dst1\().4s, \src3\().4h, v0.h[2]
+ smlal \dst2\().4s, \src4\().4h, v0.h[2]
+ smlal \tmp1\().4s, \src4\().4h, v0.h[3]
+ smlal \tmp2\().4s, \src5\().4h, v0.h[3]
+ smlal \dst1\().4s, \src5\().4h, v0.h[4]
+ smlal \dst2\().4s, \src6\().4h, v0.h[4]
+ smlal \tmp1\().4s, \src6\().4h, v0.h[5]
+ smlal \tmp2\().4s, \src7\().4h, v0.h[5]
+ smlal \dst1\().4s, \src7\().4h, v0.h[6]
+ smlal \dst2\().4s, \src8\().4h, v0.h[6]
+ smlal \tmp1\().4s, \src8\().4h, v0.h[7]
+ smlal \tmp2\().4s, \src9\().4h, v0.h[7]
+ add \dst1\().4s, \dst1\().4s, \tmp1\().4s
+ add \dst2\().4s, \dst2\().4s, \tmp2\().4s
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
+// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
+ smull \dst1\().4s, \src1\().4h, v0.h[0]
+ smull2 \dst2\().4s, \src1\().8h, v0.h[0]
+ smull \dst3\().4s, \src2\().4h, v0.h[0]
+ smull2 \dst4\().4s, \src2\().8h, v0.h[0]
+ smlal \dst1\().4s, \src2\().4h, v0.h[1]
+ smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
+ smlal \dst3\().4s, \src3\().4h, v0.h[1]
+ smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
+ smlal \dst1\().4s, \src3\().4h, v0.h[2]
+ smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
+ smlal \dst3\().4s, \src4\().4h, v0.h[2]
+ smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
+ smlal \dst1\().4s, \src4\().4h, v0.h[3]
+ smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
+ smlal \dst3\().4s, \src5\().4h, v0.h[3]
+ smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
+ smlal \dst1\().4s, \src5\().4h, v0.h[4]
+ smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
+ smlal \dst3\().4s, \src6\().4h, v0.h[4]
+ smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
+ smlal \dst1\().4s, \src6\().4h, v0.h[5]
+ smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
+ smlal \dst3\().4s, \src7\().4h, v0.h[5]
+ smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
+ smlal \dst1\().4s, \src7\().4h, v0.h[6]
+ smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
+ smlal \dst3\().4s, \src8\().4h, v0.h[6]
+ smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
+ smlal \dst1\().4s, \src8\().4h, v0.h[7]
+ smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
+ smlal \dst3\().4s, \src9\().4h, v0.h[7]
+ smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
+.endm
+
+// Instantiate a vertical filter function for filtering 8 pixels at a time.
+// The height is passed in x4, the width in x5 and the filter coefficients
+// in x6.
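+//
+// As a sketch (not the actual C reference), each output sample is
+//
+//   dst[y][x] = clip((c0*src[y-3][x] + c1*src[y-2][x] + ... + c7*src[y+4][x]
+//                     + 64) >> 7, 0, (1 << bpp) - 1)
+//
+// which is why x2 is moved back 3 lines at the start of the function; the
+// loads below then keep a sliding window of source rows in registers so
+// each row is only loaded once per 8-wide column slice.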
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v0.8h}, [x6]
+1:
+.ifc \type,avg
+ mov x7, x0
+.endif
+ mov x6, x4
+
+ ld1 {v17.8h}, [x2], x3
+ ld1 {v18.8h}, [x2], x3
+ ld1 {v19.8h}, [x2], x3
+ ld1 {v20.8h}, [x2], x3
+ ld1 {v21.8h}, [x2], x3
+ ld1 {v22.8h}, [x2], x3
+ ld1 {v23.8h}, [x2], x3
+2:
+ ld1 {v24.8h}, [x2], x3
+ ld1 {v25.8h}, [x2], x3
+ ld1 {v26.8h}, [x2], x3
+ ld1 {v27.8h}, [x2], x3
+
+ convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
+ do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type
+
+ subs x6, x6, #4
+ b.eq 8f
+
+ ld1 {v16.8h}, [x2], x3
+ ld1 {v17.8h}, [x2], x3
+ ld1 {v18.8h}, [x2], x3
+ ld1 {v19.8h}, [x2], x3
+ convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
+ convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
+ do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type
+
+ subs x6, x6, #4
+ b.eq 8f
+
+ ld1 {v20.8h}, [x2], x3
+ ld1 {v21.8h}, [x2], x3
+ ld1 {v22.8h}, [x2], x3
+ ld1 {v23.8h}, [x2], x3
+ convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
+ do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type
+
+ subs x6, x6, #4
+ b.ne 2b
+
+8:
+ subs x5, x5, #8
+ b.eq 9f
+ // x0 -= h * dst_stride
+ msub x0, x1, x4, x0
+ // x2 -= h * src_stride
+ msub x2, x3, x4, x2
+ // x2 -= 8 * src_stride
+ sub x2, x2, x3, lsl #3
+ // x2 += 1 * src_stride
+ add x2, x2, x3
+ add x2, x2, #16
+ add x0, x0, #16
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+
+// Instantiate a vertical filter function for filtering a 4 pixel wide
+// slice. This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v0.8h}, [x6]
+.ifc \type,avg
+ mov x7, x0
+.endif
+
+ ld1 {v16.4h}, [x2], x3
+ ld1 {v17.4h}, [x2], x3
+ ld1 {v18.4h}, [x2], x3
+ ld1 {v19.4h}, [x2], x3
+ ld1 {v20.4h}, [x2], x3
+ ld1 {v21.4h}, [x2], x3
+ ld1 {v22.4h}, [x2], x3
+ ld1 {v23.4h}, [x2], x3
+ ld1 {v24.4h}, [x2], x3
+ ld1 {v25.4h}, [x2], x3
+ ld1 {v26.4h}, [x2], x3
+
+ convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
+ convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
+ do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type
+
+ subs x4, x4, #4
+ b.eq 9f
+
+ ld1 {v27.4h}, [x2], x3
+ ld1 {v28.4h}, [x2], x3
+ ld1 {v29.4h}, [x2], x3
+ ld1 {v30.4h}, [x2], x3
+
+ convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
+ convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
+ do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type
+
+9:
+ ret
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+ uxtw x4, w4
+ mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+ movrel x5, X(ff_vp9_subpel_filters), 256*\offset
+ add x6, x5, w6, uxtw #4
+ mov x5, #\size
+.if \size >= 8
+ b \type\()_8tap_8v
+.else
+ b \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp, 2, \size, \bpp
+do_8tap_v_func avg, sharp, 2, \size, \bpp
+do_8tap_v_func put, smooth, 0, \size, \bpp
+do_8tap_v_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8, \bpp
+do_8tap_v_filters 4, \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
new file mode 100644
index 0000000000..f67624ca04
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// All public functions in this file have the following signature:
+// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+// const uint8_t *ref, ptrdiff_t ref_stride,
+// int h, int mx, int my);
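+//
+// A rough scalar model (not the actual C reference) of what the subpel
+// filters below compute, with FILTER_BITS = 7:
+//
+//   dst[x] = av_clip_uint8((c0*src[x-3] + c1*src[x-2] + ... + c7*src[x+4]
+//                           + 64) >> 7);
+//
+// The "avg" variants then average the filtered result with the existing
+// destination pixel, dst[x] = (dst[x] + filt + 1) >> 1, which is what
+// urhadd implements; the plain copyNN/avgNN functions directly below just
+// copy or average whole blocks without any filtering.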
+
+function ff_vp9_copy64_aarch64, export=1
+1:
+ ldp x5, x6, [x2]
+ ldp x7, x8, [x2, #16]
+ stp x5, x6, [x0]
+ ldp x9, x10, [x2, #32]
+ stp x7, x8, [x0, #16]
+ subs w4, w4, #1
+ ldp x11, x12, [x2, #48]
+ stp x9, x10, [x0, #32]
+ stp x11, x12, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+ urhadd v0.16b, v0.16b, v4.16b
+ urhadd v1.16b, v1.16b, v5.16b
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+ urhadd v2.16b, v2.16b, v6.16b
+ urhadd v3.16b, v3.16b, v7.16b
+ subs w4, w4, #2
+ urhadd v16.16b, v16.16b, v20.16b
+ urhadd v17.16b, v17.16b, v21.16b
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
+ urhadd v18.16b, v18.16b, v22.16b
+ urhadd v19.16b, v19.16b, v23.16b
+ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_copy32_aarch64, export=1
+1:
+ ldp x5, x6, [x2]
+ ldp x7, x8, [x2, #16]
+ stp x5, x6, [x0]
+ subs w4, w4, #1
+ stp x7, x8, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg32_neon, export=1
+1:
+ ld1 {v2.16b, v3.16b}, [x2], x3
+ ld1 {v0.16b, v1.16b}, [x0]
+ urhadd v0.16b, v0.16b, v2.16b
+ urhadd v1.16b, v1.16b, v3.16b
+ subs w4, w4, #1
+ st1 {v0.16b, v1.16b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_copy16_neon, export=1
+ add x5, x0, x1
+ lsl x1, x1, #1
+ add x6, x2, x3
+ lsl x3, x3, #1
+1:
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x6], x3
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x6], x3
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x5], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v3.16b}, [x2], x3
+ urhadd v0.16b, v0.16b, v2.16b
+ ld1 {v1.16b}, [x0], x1
+ urhadd v1.16b, v1.16b, v3.16b
+ subs w4, w4, #2
+ st1 {v0.16b}, [x5], x1
+ st1 {v1.16b}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_copy8_neon, export=1
+1:
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w4, w4, #2
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg8_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v3.8b}, [x2], x3
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.8b}, [x0], x1
+ urhadd v1.8b, v1.8b, v3.8b
+ subs w4, w4, #2
+ st1 {v0.8b}, [x5], x1
+ st1 {v1.8b}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_copy4_neon, export=1
+1:
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ st1 {v0.s}[0], [x0], x1
+ ld1 {v2.s}[0], [x2], x3
+ st1 {v1.s}[0], [x0], x1
+ ld1 {v3.s}[0], [x2], x3
+ subs w4, w4, #4
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg4_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.s}[0], [x2], x3
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v3.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v1.s}[1], [x0], x1
+ subs w4, w4, #4
+ urhadd v0.8b, v0.8b, v2.8b
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.s}[0], [x5], x1
+ st1 {v0.s}[1], [x5], x1
+ st1 {v1.s}[0], [x5], x1
+ st1 {v1.s}[1], [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+
+// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
+// dst1-dst2 and dst3-dst4 for size >= 16)
+.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
+ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
+.if \size >= 16
+ mla \dst1\().8h, v20.8h, v0.h[\offset]
+ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+ mla \dst3\().8h, v22.8h, v0.h[\offset]
+ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+ mla \dst2\().8h, v21.8h, v0.h[\offset]
+ mla \dst4\().8h, v23.8h, v0.h[\offset]
+.elseif \size == 8
+ mla \dst1\().8h, v20.8h, v0.h[\offset]
+ mla \dst3\().8h, v22.8h, v0.h[\offset]
+.else
+ mla \dst1\().4h, v20.4h, v0.h[\offset]
+ mla \dst3\().4h, v22.4h, v0.h[\offset]
+.endif
+.endm
+// The same as above, but instead of accumulating straight into the
+// destination, use a temp register and accumulate with saturation.
+.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
+ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
+.if \size >= 16
+ mul v20.8h, v20.8h, v0.h[\offset]
+ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+ mul v22.8h, v22.8h, v0.h[\offset]
+ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+ mul v21.8h, v21.8h, v0.h[\offset]
+ mul v23.8h, v23.8h, v0.h[\offset]
+.elseif \size == 8
+ mul v20.8h, v20.8h, v0.h[\offset]
+ mul v22.8h, v22.8h, v0.h[\offset]
+.else
+ mul v20.4h, v20.4h, v0.h[\offset]
+ mul v22.4h, v22.4h, v0.h[\offset]
+.endif
+.if \size == 4
+ sqadd \dst1\().4h, \dst1\().4h, v20.4h
+ sqadd \dst3\().4h, \dst3\().4h, v22.4h
+.else
+ sqadd \dst1\().8h, \dst1\().8h, v20.8h
+ sqadd \dst3\().8h, \dst3\().8h, v22.8h
+.if \size >= 16
+ sqadd \dst2\().8h, \dst2\().8h, v21.8h
+ sqadd \dst4\().8h, \dst4\().8h, v23.8h
+.endif
+.endif
+.endm
+
+
+// Instantiate a horizontal filter function for the given size.
+// This can work on 4, 8 or 16 pixels in parallel; for larger
+// widths it will do 16 pixels at a time and loop horizontally.
+// The actual width is passed in x5, the height in w4 and the
+// filter coefficients in x9. idx2 is the index of the largest
+// filter coefficient (3 or 4) and idx1 is the other one of them.
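+//
+// The reason for the two variants (an observation about the layout of
+// ff_vp9_subpel_filters, not something this file asserts): for the first
+// half of the subpel phases the largest tap is at index 3, for the second
+// half it is at index 4, so the dispatch functions further down (which
+// follow the \type\()_8tap_\size\()h_\idx1\idx2 naming above) effectively do
+//
+//   if (mx >= 8)              // w5 holds mx (or my for the vertical filters)
+//       type_8tap_sizeh_34(...);   // idx1 = 3, idx2 = 4
+//   else
+//       type_8tap_sizeh_43(...);   // idx1 = 4, idx2 = 3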
+.macro do_8tap_h type, size, idx1, idx2
+function \type\()_8tap_\size\()h_\idx1\idx2
+ sub x2, x2, #3
+ add x6, x0, x1
+ add x7, x2, x3
+ add x1, x1, x1
+ add x3, x3, x3
+ // Only size >= 16 loops horizontally and needs
+ // reduced dst stride
+.if \size >= 16
+ sub x1, x1, x5
+.endif
+ // size >= 16 loads two qwords and increments x2,
+ // for size 4/8 one qword is enough and no
+ // postincrement is needed
+.if \size >= 16
+ sub x3, x3, x5
+ sub x3, x3, #8
+.endif
+ // Load the filter vector
+ ld1 {v0.8h}, [x9]
+1:
+.if \size >= 16
+ mov x9, x5
+.endif
+ // Load src
+.if \size >= 16
+ ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
+ ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
+.else
+ ld1 {v4.8b, v5.8b}, [x2]
+ ld1 {v16.8b, v17.8b}, [x7]
+.endif
+ uxtl v4.8h, v4.8b
+ uxtl v5.8h, v5.8b
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+.if \size >= 16
+ uxtl v6.8h, v6.8b
+ uxtl v18.8h, v18.8b
+.endif
+2:
+
+ // Accumulate, adding idx2 last with a separate
+ // saturating add. The positive filter coefficients
+ // for all indices except idx2 must add up to less
+ // than 127 for this not to overflow.
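+ // (Rough bound: with 8 bit samples and those positive taps summing to at
+ // most 127, the running mla sum stays within 255 * 127 = 32385 < 32767,
+ // so the plain mla accumulation cannot wrap in int16; only adding the
+ // idx2 term on top of that could, which is why extmulqadd uses sqadd.)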
+ mul v1.8h, v4.8h, v0.h[0]
+ mul v24.8h, v16.8h, v0.h[0]
+.if \size >= 16
+ mul v2.8h, v5.8h, v0.h[0]
+ mul v25.8h, v17.8h, v0.h[0]
+.endif
+ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
+ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
+ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
+ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
+ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
+ extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
+ extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size
+
+ // Round, shift and saturate
+ sqrshrun v1.8b, v1.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.if \size >= 16
+ sqrshrun2 v1.16b, v2.8h, #7
+ sqrshrun2 v24.16b, v25.8h, #7
+.endif
+ // Average
+.ifc \type,avg
+.if \size >= 16
+ ld1 {v2.16b}, [x0]
+ ld1 {v3.16b}, [x6]
+ urhadd v1.16b, v1.16b, v2.16b
+ urhadd v24.16b, v24.16b, v3.16b
+.elseif \size == 8
+ ld1 {v2.8b}, [x0]
+ ld1 {v3.8b}, [x6]
+ urhadd v1.8b, v1.8b, v2.8b
+ urhadd v24.8b, v24.8b, v3.8b
+.else
+ ld1 {v2.s}[0], [x0]
+ ld1 {v3.s}[0], [x6]
+ urhadd v1.8b, v1.8b, v2.8b
+ urhadd v24.8b, v24.8b, v3.8b
+.endif
+.endif
+ // Store and loop horizontally (for size >= 16)
+.if \size >= 16
+ subs x9, x9, #16
+ st1 {v1.16b}, [x0], #16
+ st1 {v24.16b}, [x6], #16
+ b.eq 3f
+ mov v4.16b, v6.16b
+ mov v16.16b, v18.16b
+ ld1 {v6.16b}, [x2], #16
+ ld1 {v18.16b}, [x7], #16
+ uxtl v5.8h, v6.8b
+ uxtl2 v6.8h, v6.16b
+ uxtl v17.8h, v18.8b
+ uxtl2 v18.8h, v18.16b
+ b 2b
+.elseif \size == 8
+ st1 {v1.8b}, [x0]
+ st1 {v24.8b}, [x6]
+.else // \size == 4
+ st1 {v1.s}[0], [x0]
+ st1 {v24.s}[0], [x6]
+.endif
+3:
+ // Loop vertically
+ add x0, x0, x1
+ add x6, x6, x1
+ add x2, x2, x3
+ add x7, x7, x3
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size, 3, 4
+do_8tap_h avg, \size, 3, 4
+do_8tap_h put, \size, 4, 3
+do_8tap_h avg, \size, 4, 3
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
+.macro do_8tap_h_func type, filter, offset, size
+function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
+ movrel x6, X(ff_vp9_subpel_filters), 256*\offset
+ cmp w5, #8
+ add x9, x6, w5, uxtw #4
+ mov x5, #\size
+.if \size >= 16
+ b.ge \type\()_8tap_16h_34
+ b \type\()_8tap_16h_43
+.else
+ b.ge \type\()_8tap_\size\()h_34
+ b \type\()_8tap_\size\()h_43
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size
+do_8tap_h_func put, regular, 1, \size
+do_8tap_h_func avg, regular, 1, \size
+do_8tap_h_func put, sharp, 2, \size
+do_8tap_h_func avg, sharp, 2, \size
+do_8tap_h_func put, smooth, 0, \size
+do_8tap_h_func avg, smooth, 0, \size
+.endm
+
+do_8tap_h_filters 64
+do_8tap_h_filters 32
+do_8tap_h_filters 16
+do_8tap_h_filters 8
+do_8tap_h_filters 4
+
+
+// Vertical filters
+
+// Round, shift and saturate and store reg1-reg2 over 4 lines
+.macro do_store4 reg1, reg2, tmp1, tmp2, type
+ sqrshrun \reg1\().8b, \reg1\().8h, #7
+ sqrshrun \reg2\().8b, \reg2\().8h, #7
+.ifc \type,avg
+ ld1 {\tmp1\().s}[0], [x7], x1
+ ld1 {\tmp2\().s}[0], [x7], x1
+ ld1 {\tmp1\().s}[1], [x7], x1
+ ld1 {\tmp2\().s}[1], [x7], x1
+ urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
+ urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
+.endif
+ st1 {\reg1\().s}[0], [x0], x1
+ st1 {\reg2\().s}[0], [x0], x1
+ st1 {\reg1\().s}[1], [x0], x1
+ st1 {\reg2\().s}[1], [x0], x1
+.endm
+
+// Round, shift and saturate and store reg1-4
+.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
+ sqrshrun \reg1\().8b, \reg1\().8h, #7
+ sqrshrun \reg2\().8b, \reg2\().8h, #7
+ sqrshrun \reg3\().8b, \reg3\().8h, #7
+ sqrshrun \reg4\().8b, \reg4\().8h, #7
+.ifc \type,avg
+ ld1 {\tmp1\().8b}, [x7], x1
+ ld1 {\tmp2\().8b}, [x7], x1
+ ld1 {\tmp3\().8b}, [x7], x1
+ ld1 {\tmp4\().8b}, [x7], x1
+ urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
+ urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
+ urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
+ urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
+.endif
+ st1 {\reg1\().8b}, [x0], x1
+ st1 {\reg2\().8b}, [x0], x1
+ st1 {\reg3\().8b}, [x0], x1
+ st1 {\reg4\().8b}, [x0], x1
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
+// at the end with saturation. Indices 0 and 7 always have negative or zero
+// coefficients, so they can be accumulated into tmp1-tmp2 together with the
+// largest coefficient.
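+// An illustrative scalar sketch (C-style, not part of this file) of the same
+// idea: the taps are split into two partial sums, pairing the largest
+// (positive) tap with the nonpositive taps 0 and 7, and the two sums are
+// combined with a saturating add (the final sqadd):
+//   int16_t sum = 0, tmp = 0;
+//   for (int i = 1; i < 7; i++)
+//       if (i != idx2)
+//           sum += src[i] * filter[i];      // the mul/mla chain into dst1/dst2
+//   tmp += src[0]    * filter[0];           // tap 0, <= 0
+//   tmp += src[7]    * filter[7];           // tap 7, <= 0
+//   tmp += src[idx2] * filter[idx2];        // the largest tap
+//   out = av_clip_int16(sum + tmp);         // sqadd; narrowed later by sqrshrun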
+.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
+ mul \dst1\().8h, \src2\().8h, v0.h[1]
+ mul \dst2\().8h, \src3\().8h, v0.h[1]
+ mul \tmp1\().8h, \src1\().8h, v0.h[0]
+ mul \tmp2\().8h, \src2\().8h, v0.h[0]
+ mla \dst1\().8h, \src3\().8h, v0.h[2]
+ mla \dst2\().8h, \src4\().8h, v0.h[2]
+.if \idx1 == 3
+ mla \dst1\().8h, \src4\().8h, v0.h[3]
+ mla \dst2\().8h, \src5\().8h, v0.h[3]
+.else
+ mla \dst1\().8h, \src5\().8h, v0.h[4]
+ mla \dst2\().8h, \src6\().8h, v0.h[4]
+.endif
+ mla \dst1\().8h, \src6\().8h, v0.h[5]
+ mla \dst2\().8h, \src7\().8h, v0.h[5]
+ mla \tmp1\().8h, \src8\().8h, v0.h[7]
+ mla \tmp2\().8h, \src9\().8h, v0.h[7]
+ mla \dst1\().8h, \src7\().8h, v0.h[6]
+ mla \dst2\().8h, \src8\().8h, v0.h[6]
+.if \idx2 == 3
+ mla \tmp1\().8h, \src4\().8h, v0.h[3]
+ mla \tmp2\().8h, \src5\().8h, v0.h[3]
+.else
+ mla \tmp1\().8h, \src5\().8h, v0.h[4]
+ mla \tmp2\().8h, \src6\().8h, v0.h[4]
+.endif
+ sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
+ sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
+.endm
+
+// Load pixels and extend them to 16 bit
+.macro loadl dst1, dst2, dst3, dst4
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+.ifnb \dst4
+ ld1 {v4.8b}, [x2], x3
+.endif
+ uxtl \dst1\().8h, v1.8b
+ uxtl \dst2\().8h, v2.8b
+ uxtl \dst3\().8h, v3.8b
+.ifnb \dst4
+ uxtl \dst4\().8h, v4.8b
+.endif
+.endm
+
+// Instantiate a vertical filter function for filtering 8 pixels at a time.
+// The height is passed in x4, the width in x5 and the filter coefficients
+// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
+// and idx1 is the other one.
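+// For reference, an illustrative scalar model of the operation this
+// implements (names here are not from this file; src points at the first
+// output row, filter at the 8 taps):
+//   for (int y = 0; y < h; y++)
+//       for (int x = 0; x < 8; x++) {
+//           int sum = 0;
+//           for (int i = 0; i < 8; i++)
+//               sum += src[(y + i - 3) * src_stride + x] * filter[i];
+//           dst[y * dst_stride + x] = av_clip_uint8((sum + 64) >> 7);
+//       }
+// The avg variants additionally average the result with the existing dst.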
+.macro do_8tap_8v type, idx1, idx2
+function \type\()_8tap_8v_\idx1\idx2
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v0.8h}, [x6]
+1:
+.ifc \type,avg
+ mov x7, x0
+.endif
+ mov x6, x4
+
+ loadl v17, v18, v19
+
+ loadl v20, v21, v22, v23
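+ // The 3 + 4 loads above provide the first 7 of the 8 source rows needed
+ // for the first output row; each pass through the loop below loads 4 more
+ // rows and produces 4 output rows.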
+2:
+ loadl v24, v25, v26, v27
+ convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
+ convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
+ do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
+
+ subs x6, x6, #4
+ b.eq 8f
+
+ loadl v16, v17, v18, v19
+ convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
+ convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
+ do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
+
+ subs x6, x6, #4
+ b.eq 8f
+
+ loadl v20, v21, v22, v23
+ convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
+ convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
+ do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
+
+ subs x6, x6, #4
+ b.ne 2b
+
+8:
+ subs x5, x5, #8
+ b.eq 9f
+ // x0 -= h * dst_stride
+ msub x0, x1, x4, x0
+ // x2 -= h * src_stride
+ msub x2, x3, x4, x2
+ // x2 -= 8 * src_stride
+ sub x2, x2, x3, lsl #3
+ // x2 += 1 * src_stride
+ add x2, x2, x3
+ add x2, x2, #8
+ add x0, x0, #8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+do_8tap_8v put, 3, 4
+do_8tap_8v put, 4, 3
+do_8tap_8v avg, 3, 4
+do_8tap_8v avg, 4, 3
+
+
+// Instantiate a vertical filter function for filtering a 4 pixel wide
+// slice. The first half of each register contains one row, while the second
+// half contains the second-next row (which is also stored in the first
+// half of the register two steps ahead). The convolution produces two
+// outputs at a time: one from v17-v24 and one from v18-v25.
+// The first half of the first output is the first output row, and the first
+// half of the second output is the second output row; the second halves of
+// the two outputs are output rows 3 and 4.
+// This is only designed to work for 4 or 8 output lines.
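+// An illustrative view of the packing (source rows counted from the first
+// load): v17 = {row0, row2}, v18 = {row1, row3}, ..., v24 = {row7, row9},
+// v25 = {row8, row10}.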
+.macro do_8tap_4v type, idx1, idx2
+function \type\()_8tap_4v_\idx1\idx2
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v0.8h}, [x6]
+.ifc \type,avg
+ mov x7, x0
+.endif
+
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x2], x3
+ ld1 {v4.s}[0], [x2], x3
+ ld1 {v5.s}[0], [x2], x3
+ ld1 {v6.s}[0], [x2], x3
+ trn1 v1.2s, v1.2s, v3.2s
+ ld1 {v7.s}[0], [x2], x3
+ trn1 v2.2s, v2.2s, v4.2s
+ ld1 {v26.s}[0], [x2], x3
+ uxtl v17.8h, v1.8b
+ trn1 v3.2s, v3.2s, v5.2s
+ ld1 {v27.s}[0], [x2], x3
+ uxtl v18.8h, v2.8b
+ trn1 v4.2s, v4.2s, v6.2s
+ ld1 {v28.s}[0], [x2], x3
+ uxtl v19.8h, v3.8b
+ trn1 v5.2s, v5.2s, v7.2s
+ ld1 {v29.s}[0], [x2], x3
+ uxtl v20.8h, v4.8b
+ trn1 v6.2s, v6.2s, v26.2s
+ uxtl v21.8h, v5.8b
+ trn1 v7.2s, v7.2s, v27.2s
+ uxtl v22.8h, v6.8b
+ trn1 v26.2s, v26.2s, v28.2s
+ uxtl v23.8h, v7.8b
+ trn1 v27.2s, v27.2s, v29.2s
+ uxtl v24.8h, v26.8b
+ uxtl v25.8h, v27.8b
+
+ convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
+ do_store4 v1, v2, v5, v6, \type
+
+ subs x4, x4, #4
+ b.eq 9f
+
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x2], x3
+ trn1 v28.2s, v28.2s, v1.2s
+ trn1 v29.2s, v29.2s, v2.2s
+ ld1 {v1.s}[1], [x2], x3
+ uxtl v26.8h, v28.8b
+ ld1 {v2.s}[1], [x2], x3
+ uxtl v27.8h, v29.8b
+ uxtl v28.8h, v1.8b
+ uxtl v29.8h, v2.8b
+
+ convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
+ do_store4 v1, v2, v5, v6, \type
+
+9:
+ ret
+endfunc
+.endm
+
+do_8tap_4v put, 3, 4
+do_8tap_4v put, 4, 3
+do_8tap_4v avg, 3, 4
+do_8tap_4v avg, 4, 3
+
+
+.macro do_8tap_v_func type, filter, offset, size
+function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
+ uxtw x4, w4
+ movrel x5, X(ff_vp9_subpel_filters), 256*\offset
+ cmp w6, #8
+ add x6, x5, w6, uxtw #4
+ mov x5, #\size
+.if \size >= 8
+ b.ge \type\()_8tap_8v_34
+ b \type\()_8tap_8v_43
+.else
+ b.ge \type\()_8tap_4v_34
+ b \type\()_8tap_4v_43
+.endif
+endfunc
+.endm
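+// These wrappers mirror the horizontal ones above: the filter row address is
+// computed the same way (16 bytes per subpel position), but the subpel index
+// arrives in w6 and the height in w4 is zero-extended first.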
+
+.macro do_8tap_v_filters size
+do_8tap_v_func put, regular, 1, \size
+do_8tap_v_func avg, regular, 1, \size
+do_8tap_v_func put, sharp, 2, \size
+do_8tap_v_func avg, sharp, 2, \size
+do_8tap_v_func put, smooth, 0, \size
+do_8tap_v_func avg, smooth, 0, \size
+.endm
+
+do_8tap_v_filters 64
+do_8tap_v_filters 32
+do_8tap_v_filters 16
+do_8tap_v_filters 8
+do_8tap_v_filters 4